# -*- coding: utf-8 -*-

'''
Created on 2010-1-15

@author: Michael
'''

from sgmllib import SGMLParser
import htmlentitydefs
import string

class BaseProcessor(SGMLParser):
    def reset(self):
        # extend (called by SGMLParser.__init__)
        self.pieces = []
        self.verbatim = 1
        SGMLParser.reset(self)
        
    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # Reconstruct the original start tag.
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
        
    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be "pre"
        # Reconstruct the original end tag.
        self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for "&#160;", ref will be "160"
        # Reconstruct the original character reference.
        self.pieces.append("&#%(ref)s;" % locals())
        
    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if self.verbatim:
            self.pieces.append(text)
        
    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Just delete the original comment.
        pass
        
    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append("<?%(text)s>" % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        pass
        
    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for "&copy;", ref will be "copy"
        # Reconstruct the original entity reference.
        # standard HTML entities are closed with a semicolon; other entities are not
        if htmlentitydefs.entitydefs.has_key(ref):
            if ref == 'nbsp':
                self.pieces.append("&%(ref)s" % locals())
                self.pieces.append(";")
    
    def output(self):
        """Return processed HTML as a single string"""
        return "".join(self.pieces)
    
        
class IMGLister(BaseProcessor):
    """BaseProcessor that also records the ``src`` of every ``<img>`` tag.

    The collected values are available in ``self.pics`` in document order.
    """

    def reset(self):
        BaseProcessor.reset(self)
        self.pics = []

    def start_img(self, attrs):
        # Called for every <img> tag; attrs is a list of (name, value) pairs.
        # Every attribute whose name is "src" (case-insensitively) is kept.
        self.pics.extend([value for key, value in attrs
                          if key.lower() == 'src'])

#    def end_img(self):
#        # called for every </img> tag in HTML source
#        pass
        
    
class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.key=""
        self.value=""
        #valid data flag
        self.verbatim=0
        #store title and url pairs, ie:[[title,url],[title,url]...]
        self.links = []
        
    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if self.verbatim and text:
            self.key=text.strip()

    def start_a(self, attrs):
        for k, v in attrs:
            if k=='href':
                href=v
                break
        if href:
            self.value=href.strip()
            self.verbatim=1
    
    def end_a(self):
        # called for every </a> tag in HTML source
        if self.key and self.value:
            self.links.append([self.key,self.value])
        self.verbatim = 0

class ScriptLinkLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
#        self.value=""
#        self.href=""
        self.links = []

    def start_script(self, attrs):
        for k, v in attrs:
            if k=='src':
                if v:
                    self.links.append(v)
                break
#        if self.href:
#            self.value=href.strip()

#    def end_script(self):
#        if self.href and self.value:
#            self.links.append(self.value)

class Processor(BaseProcessor):
    """Cut the relevant part out of a chapter, catalog or search page.

    A page is first re-serialised by the parser (see ``unknown_starttag`` /
    ``unknown_endtag`` for which tags and text survive), then cut down to
    the region between a start sign and an end sign, and finally handed to
    one of the lister classes.  The signs, tag lists and filter words are
    per-website settings loaded with ``init()``.
    """

    def reset(self):
        # Extends BaseProcessor.reset() (called by SGMLParser.__init__).
        BaseProcessor.reset(self)
        self.startSign = ""
        self.endSign = ""
        self.tagsToKeep = []
        self.tagsToRemove = []

    def init(self, website):
        """Load the per-website settings.

        ``website`` is a mapping that must contain every key read below;
        a missing key raises ``KeyError``.
        """
        self.chapterStartSign = website["chapterStartSign"]
        self.chapterEndSign = website["chapterEndSign"]
        self.catalogStartSign = website["catalogStartSign"]
        self.catalogEndSign = website["catalogEndSign"]
        self.chapterTagsToKeep = website["chapterTagsToKeep"]
        self.chapterTagsToRemove = website["chapterTagsToRemove"]
        self.catalogTagsToKeep = website["catalogTagsToKeep"]
        self.catalogTagsToRemove = website["catalogTagsToRemove"]
        self.filterWords = website["filterWords"]
        self.searchStartSign = website["searchStartSign"]
        self.searchEndSign = website["searchEndSign"]
        self.searchTagsToKeep = website["searchTagsToKeep"]
        self.searchTagsToRemove = website["searchTagsToRemove"]
        self.searchResultPattern = website["searchResultPattern"]

    def _extract(self, content, startSign, endSign, tagsToKeep, tagsToRemove,
                 applyFilter):
        # Shared pipeline of the process_* methods: parse content with the
        # given settings, cut out the region between the signs and, when
        # applyFilter is true, blank out the filter words.
        self.reset()
        self.startSign = startSign
        self.endSign = endSign
        self.tagsToKeep = tagsToKeep
        self.tagsToRemove = tagsToRemove
        self.feed(content)
        s = self.output()
        if s:
            s = self.split(s)
            if applyFilter:
                s = self.filter(s)
        return s

    def process_chapter(self, content):
        """Process a chapter page.

        Returns a dict with three keys: "string" (the extracted markup as
        re-serialised by IMGLister), "pics" (the ``src`` values of its
        ``<img>`` tags) and "links" (the ``src`` values of its ``<script>``
        tags).
        """
        s = self._extract(content, self.chapterStartSign, self.chapterEndSign,
                          self.chapterTagsToKeep, self.chapterTagsToRemove,
                          True)
        imgLister = IMGLister()
        imgLister.feed(s)
        imgLister.close()
        scriptLinkLister = ScriptLinkLister()
        scriptLinkLister.feed(s)
        scriptLinkLister.close()
        return {
            "string": imgLister.output(),
            "pics": imgLister.pics,
            "links": scriptLinkLister.links,
        }

    def process_catalog(self, content):
        """Process a catalog page and return its ``[title, url]`` pairs."""
        s = self._extract(content, self.catalogStartSign, self.catalogEndSign,
                          self.catalogTagsToKeep, self.catalogTagsToRemove,
                          True)
        urlLister = URLLister()
        urlLister.feed(s)
        urlLister.close()
        return urlLister.links

    def process_search(self, content):
        """
        return value: a list of search results
        Each search result is a dict whose keys come from
        searchResultPattern and whose values are [text, href] links, e.g.
        [{  "name":["book's name","a href link"],
            "author":["author's name","a href link"],
            "category":["book's category","a href link"],
            "lastestChapter":["lastestChapter's name","a href link"]
            },
            ...
        ]
        The links found on the page are assigned to the pattern keys in
        order, one group of len(searchResultPattern) links per result; a
        trailing incomplete group is dropped.  Filter words are not applied
        to search pages.
        """
        s = self._extract(content, self.searchStartSign, self.searchEndSign,
                          self.searchTagsToKeep, self.searchTagsToRemove,
                          False)
        urlLister = URLLister()
        urlLister.feed(s)
        urlLister.close()
        links = urlLister.links

        searchResults = []
        pattern = self.searchResultPattern
        patternNum = len(pattern)
        if patternNum == 0:
            return searchResults
        # Walk the links one complete group at a time.
        for first in range(0, len(links) - patternNum + 1, patternNum):
            group = links[first:first + patternNum]
            searchResults.append(dict(zip(pattern, group)))
        return searchResults

    def _encodedSign(self, sign):
        # Return sign encoded as GBK, or "" when the sign is unset (None,
        # empty or whitespace only) and must therefore be ignored.
        if sign is None or not sign.strip():
            return ""
        return sign.encode("gbk")

    def split(self, content):
        """Return the part of ``content`` between the start and end signs.

        The signs are encoded as GBK before searching, so ``content`` is
        presumably a GBK-encoded byte string -- confirm against callers.
        A sign that is unset or not found leaves the corresponding edge of
        the content untouched.  The end sign is looked for after the start
        sign only.
        """
        startSign = self._encodedSign(self.startSign)
        endSign = self._encodedSign(self.endSign)

        beginIndex = 0
        if startSign:
            found = content.find(startSign)
            if found != -1:
                # Skip the sign itself, measured in its *encoded* form:
                # that is what was matched, and it is longer than the
                # unencoded sign whenever multi-byte characters are present.
                beginIndex = found + len(startSign)

        endIndex = len(content)
        if endSign:
            found = content.find(endSign, beginIndex)
            if found != -1:
                endIndex = found
        return content[beginIndex:endIndex]

    def filter(self, content):
        """Replace every occurrence of each filter word with one space."""
        for word in self.filterWords:
            word = word.encode("gbk")
            if not word:
                # An empty word matches nothing meaningful; skip it.
                continue
            content = content.replace(word, " ")
        return content

    def unknown_starttag(self, tag, attrs):
        # Called for each start tag.
        #   tag in tagsToKeep   -> tag is emitted, following text is kept
        #   tag in tagsToRemove -> tag is dropped, following text is dropped
        #   any other tag       -> tag is emitted, following text is dropped
        keep = tag in self.tagsToKeep
        if keep or tag not in self.tagsToRemove:
            BaseProcessor.unknown_starttag(self, tag, attrs)
        self.verbatim = 1 if keep else 0

    def unknown_endtag(self, tag):
        # Called for each end tag, e.g. for </pre> tag is "pre".
        # End tags are emitted unless the tag is in tagsToRemove; text
        # following any end tag is kept again.
        if tag not in self.tagsToRemove:
            self.pieces.append("</%s>" % tag)
        self.verbatim = 1


    
