# coding: utf-8

import urllib
import urllib2
import cookielib
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup
from BeautifulSoup import Comment
from BeautifulSoup import CData
from BeautifulSoup import NavigableString
from BeautifulSoup import Tag
from urlparse import urljoin
from urlparse import urlparse
from urlparse import urlsplit
from urlparse import urlunsplit
from datetime import datetime
from datetime import date
from datetime import timedelta
import uuid
import os
import re
import locale
import glob


# English month abbreviations, as they appear in RFC 822 dates, mapped to month numbers.
month_names = {
    "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
    "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12,
}

# Time zone names mapped to the equivalent numeric "+hhmm" / "-hhmm" offset.
zone_names = {
    "UT": "+0000", "GMT": "+0000",
    "EST": "-0500", "EDT": "-0400",
    "CST": "-0600", "CDT": "-0500",
    "MST": "-0700", "MDT": "-0600",
    "PST": "-0800", "PDT": "-0700",
    "Z": "+0000", "A": "-0100", "M": "-1200", "N": "+0100", "Y": "+1200",
}


def _expand_empty_element(match):
    """ Rewrites a self-closing <name attrs/> element as <name attrs></name>. """
    name = match.group(1)
    return "<" + name + match.group(2) + "></" + name + ">"


# List of (compiled pattern, replacement) pairs; the module passes it as
# markupMassage when feeds are parsed with BeautifulStoneSoup.
massage = [(re.compile(r'<([^<>\s]+)([^<>]*)/>'), _expand_empty_element)]


def parse_date(str):
    """ RSS or Atom feed date parsing.

    RSS and Atom feeds contain locale independent dates so it is impossible to use strptime().

    Two layouts are recognised:
    - text with five or more blank separated parts is read as an RFC 822 style
      date ("Tue, 15 Jan 2008 10:30:45 GMT"); day name and seconds are optional,
      the zone is either numeric ("+0300") or one of zone_names;
    - anything else is read as an ISO 8601 style date
      ("2008-01-15T10:30:45.5+05:30"); month, day, time, fraction of a second
      and zone are optional, trailing "Z" means UTC.

    str - date text taken from the feed.
    Returns tuple (datetime, timedelta): date and time exactly as written
    (no zone applied) and the zone offset, negative for "-hhmm" zones.
    Raises ValueError, KeyError or IndexError when the text follows neither layout.
    """
    str = str.strip()
    # split() without separator tolerates repeated blanks ("Tue,  5 Jan 2008 ...").
    parts = str.split()
    (y, m, d, hh, mm, ss, us) = (0, 1, 1, 0, 0, 0, 0)
    if len(parts) >= 5:
        if parts[0].find(",") >= 0:
            parts = parts[1:]
        zone = "+0000"
        if len(parts) > 4:
            # Named zones are translated, numeric ones are used as they are.
            zone = zone_names.get(parts[-1], parts[-1])
        time_parts = parts[3].split(":")
        if len(time_parts) == 3:
            ss = int(time_parts[2])
        (y, m, d, hh, mm) = (int(parts[2]), month_names[parts[1]], int(parts[0]),
            int(time_parts[0]), int(time_parts[1]))
    else:
        zone = "+0000"
        if str.endswith("Z"):
            str = str[:-1]
        elif "T" in str or " " in str:
            # An offset can only follow a time. Without this check the tail of
            # a date-only value ("-01-15") would be taken for a zone.
            offset = re.search(r"([+-]\d\d):?(\d\d)$", str)
            if offset:
                zone = offset.group(1) + offset.group(2)
                str = str[:offset.start()].strip()
        parts = str.split()
        if len(parts) <= 1:
            parts = str.split("T")
        date_parts = parts[0].split("-")
        y = int(date_parts[0])
        if len(date_parts) > 1:
            m = int(date_parts[1])
        if len(date_parts) > 2:
            d = int(date_parts[2])
        if len(parts) > 1:
            time_parts = parts[1].split(":")
            hh = int(time_parts[0])
            mm = int(time_parts[1][:2])
            if len(time_parts) > 2:
                seconds = time_parts[2]
                ss = int(seconds[:2])
                dot = seconds.find(".")
                if dot >= 0:
                    # datetime takes microseconds: pad or cut the fraction to
                    # exactly six digits (".5" -> 500000, ".123" -> 123000).
                    us = int((seconds[dot + 1:] + "000000")[:6])
    # The sign applies to the whole offset, minutes included ("-0530" is -5:30).
    sign = 1
    if zone[0] == "-":
        sign = -1
    (dh, dm) = (sign * int(zone[1:3]), sign * int(zone[3:]))
    return (datetime(y, m, d, hh, mm, ss, us), timedelta(minutes = dm, hours = dh))


def create_url_opener():
    """ Opener is for opening urls.

    Opener tries to use either microb (on N800) or firefox (on PC) cookies.
    The first cookies.txt that exists and can be loaded is used. When there is
    no such file an opener without cookie support is returned, so a missing or
    damaged cookie file never prevents opening urls.
    """
    # expanduser() also works when the HOME variable is not set.
    home = os.path.expanduser("~")
    candidates = [os.path.join(home, ".mozilla/microb/cookies.txt")]
    candidates.extend(sorted(glob.glob(os.path.join(home, ".mozilla/firefox/*default/cookies.txt"))))
    for filename in candidates:
        if not os.path.exists(filename):
            continue
        cj = cookielib.MozillaCookieJar()
        try:
            cj.load(filename)
        except IOError:
            # Unreadable or malformed file (cookielib.LoadError is a kind of
            # IOError): try the next candidate.
            continue
        return urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    return urllib2.build_opener()


# Shared opener; every download made through urlopener in this module uses it.
urlopener = create_url_opener()


def quote_url_path(url):
    """ Percent-encodes the path component of url; other components are left as they are.

    Escape sequences already present in the path ("%20") are kept instead of
    being encoded a second time ("%2520"). A "%" that does not start such a
    sequence is encoded as "%25".

    url - absolute or relative url.
    Returns url with quoted path.
    """
    p = urlsplit(url)
    path = p[2]
    if not isinstance(path, str):
        # Python 2 unicode path: hand quote() utf-8 bytes.
        path = path.encode("utf-8")
    quoted = urllib.quote(path, safe = "/%")
    # "%" was left alone above; escape only those that are not part of "%XX".
    quoted = re.sub("%(?![0-9A-Fa-f]{2})", "%25", quoted)
    return urlunsplit((p[0], p[1], quoted, p[3], p[4]))


def get_or_create_tag(html, list, tag = None):
    """ Walks the chain of tag names given in list, creating missing tags on the way.

    html - document the new tags are created for; also the starting point
    when tag is not given.
    list - tag names, outermost first, e.g. ["head", "title"].
    tag - optional tag to start the search from.
    Returns the tag that corresponds to the last name in list.
    """
    node = tag
    names = list
    while True:
        if not node:
            node = html
        name = names[0]
        child = node.find(name)
        if not child:
            child = Tag(html, name)
            node.append(child)
        if len(names) <= 1:
            return child
        # Descend one level and continue with the remaining names.
        (node, names) = (child, names[1:])


def get_page_encoding(page):
    """ Not all html documents have http-equiv meta, so this retrieves real http headers.

    page - opened url; page.info() gives access to the response headers.
    Returns value of the charset parameter of the Content-Type header, without
    quotes, or None when the header or a non-empty charset is absent. The
    parameter name is matched regardless of case ("Charset=...").
    """
    content_type = page.info().get("Content-Type")
    if not content_type:
        return None
    for param in content_type.split(";"):
        (name, sep, value) = param.partition("=")
        if sep and name.strip().lower() == "charset":
            value = value.replace("'", "").replace('"', "").strip()
            if value:
                return value
    return None


def get_page_base(page, html):
    """ Returns url that relative references of the document are resolved against.

    page - opened url, page.geturl() is the address of the document.
    html - parsed document.
    The href of the <base> tag is used when the document has one (a relative
    href is resolved against the address of the document). A <base> without
    href, e.g. <base target="_blank">, is ignored.
    """
    base = html.find("base")
    if base:
        href = base.get("href")
        if href:
            return urljoin(page.geturl(), href)
    return page.geturl()


def get_page_html_only(url, **kwargs):
    """ Downloads html page and parses it.

    url - address of the page; its path is quoted before the request is made.
    kwargs - passed to the Request object constructor (e.g. headers).
    Returns tuple (page, html, base): the opened url, the parsed document and
    the url for resolving relative references.
    """
    request = urllib2.Request(url = quote_url_path(url), **kwargs)
    page = urlopener.open(request)
    # Encoding announced by http headers, if any, is given to the parser.
    encoding = get_page_encoding(page)
    html = BeautifulSoup(page, fromEncoding = encoding)
    return (page, html, get_page_base(page, html))


# Matches the http-equiv value of <meta http-equiv="refresh"> in any letter case.
refresh_reqex = re.compile("refresh", re.IGNORECASE)


def try_skip_banner(page, html, base, **kwargs):
    """ This may help avoid ads on some resources.

    When web page contains Refresh in its headers or meta it is likely that
    it is banner so we try to reload it immediately.

    page, html, base - result of get_page_html_only() for the page.
    kwargs - passed to get_page_html_only() when the page is reloaded.
    Returns (page, html, base) of the reloaded page, or the arguments
    unchanged when the page does not ask for refresh.
    """
    refresh = page.info().get("Refresh")
    if not refresh:
        tag = html.find(name = "meta", attrs = {"http-equiv": refresh_reqex})
        if tag:
            # get(): meta refresh without content attribute is just ignored.
            refresh = tag.get("content")
    if not refresh:
        return (page, html, base)
    # Without explicit target the page itself is reloaded.
    new_url = page.geturl()
    ind = refresh.lower().find("url=")
    if ind >= 0:
        # The target may be quoted: content="0; url='http://...'".
        target = refresh[ind + 4:].strip().strip("'\"").strip()
        if target:
            new_url = urljoin(base, target)
    return get_page_html_only(new_url, **kwargs)


def get_page_html(url, **kwargs):
    """ Downloads and parses html page.

    kwargs are passed on to get_page_html_only().
    Returns tuple (html, base, url): parsed document, url for resolving
    relative references and the address reported by the opened page.
    """
    fetched = get_page_html_only(url, **kwargs)
    # try_skip_banner() is not applied to the result: that step is switched off.
    (page, html, base) = fetched
    return (html, base, page.geturl())

def get_feed_title(xml):
    """ Returns title of RSS or Atom feed.

    xml - parsed feed.
    The text of <title> placed directly under <channel> or <feed> is returned.
    "untitled" is returned when there is no such element or it has no text,
    so the result can always be used as a string.
    """
    chan = xml.find(["channel", "feed"])
    if chan:
        title = chan.find("title", recursive = False)
        if title and title.string:
            return title.string
    return "untitled"


def _find_html_icon(html, base):
    """ Returns absolute url of the first <link rel="...icon..."> with href, or None. """
    for icon_tag in html.findAll(name = "link", attrs = {"rel": re.compile("icon", re.I)}):
        if icon_tag.has_key("href"):
            return urljoin(base, icon_tag["href"])
    return None


def discover_feed(url):
    """ Returns RSS or Atom feed url from related html page.

    Not only RSS or Atom feed urls may be entered by user, so
    this function checks what is passed and if it is html page
    the function tries to detect feed url. It also retrieves icon.

    url - address of a feed or of an html page that refers to a feed.
    Returns tuple (url, title, icon_url): address of the feed (the argument
    itself when no feed reference was found), feed title and icon address
    or None.
    The icon is taken from the feed (<icon>, <logo> or <image>) when it names
    one, otherwise from the entered html page, otherwise from the html page
    the feed links to.
    """
    page = urlopener.open(url)
    icon_url = None
    content_type = page.info().get("Content-Type")
    if content_type and content_type.find("html") >= 0:
        html = BeautifulSoup(page, fromEncoding = get_page_encoding(page))
        base = get_page_base(page, html)
        icon_url = _find_html_icon(html, base)
        # RSS is preferred, Atom is used when there is no RSS reference.
        rss_tag = html.find(name = "link", attrs = {"rel": "alternate", "type": "application/rss+xml"})
        if not rss_tag:
            rss_tag = html.find(name = "link", attrs = {"rel": "alternate", "type": "application/atom+xml"})
        if rss_tag and rss_tag.has_key("href"):
            url = urljoin(base, rss_tag["href"])
            page = urlopener.open(url)
    xml = BeautifulStoneSoup(page, markupMassage = massage)
    title = get_feed_title(xml)
    icon_tag = xml.find(["icon", "logo", "image"])
    if icon_tag:
        # RSS keeps the address in <image><url>, Atom directly in the element.
        if icon_tag.url:
            feed_icon = icon_tag.url.string
        else:
            feed_icon = icon_tag.string
        if feed_icon and feed_icon.strip():
            icon_url = urljoin(page.geturl(), feed_icon.strip())
    if not icon_url:
        # Last resort: look for the icon at the html page the feed belongs to.
        html_url = None
        html_tag = xml.find(name = "link", attrs = {"rel": "alternate"})
        if not html_tag:
            html_tag = xml.find("link")
        if html_tag:
            if html_tag.has_key("href"):
                html_url = html_tag["href"] #Atom uses href to store links
            else:
                html_url = html_tag.string
        if html_url and html_url.strip():
            page = urlopener.open(urljoin(page.geturl(), html_url.strip()))
            html = BeautifulSoup(page, fromEncoding = get_page_encoding(page))
            icon_url = _find_html_icon(html, get_page_base(page, html))
    return (url, title, icon_url)


class Item:
    """ This class groups feed element related operations together.

    This is really not related to OOP best practices. It simply handy
    to have all this fields accessible via self.
    """

    page_template_default = '<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" ></head><body></body></html>'
    print_version_keywords_default = [u"версия для печати", u"printable version"]
    cached_comment_prefix = "cached item: "


    def __init__(self, url = None, tag = None, base = None, path = "", cache_path = "", callback = None, \
        page_template = None, print_version_keywords = None, try_use_print_version = True, \
        allow_scripts = False, as_is = False, content_kwargs = None, reformatter_url = None):
        """ It is clear that this is constructor :)

        Item is represented either by its content, when tag is specified, or by reference
        (url is used in this case). 
        base - html document base used to resolve relative references.
        path - base directory used to store cached item
        cache_path - subdirectory of path used to cache related resources (such as images).
        callback - for reporting progress.
        page_template - used for creating cached items.
        print_version_keywords - a list of phrases to scan html document searching for reference to
        printable version.
        try_use_print_version - search for reference to printable version should be performed.
        allow_scripts - do not remove script tags from item before caching.
        as_is - do not try to reformat html document for small screens.
        content_kwargs - it is passed to BeautifulSoup's find method. It is usually
        tag name and its attributes.
        reformatter_url - format string (%s is replaced with url from item).
        Used to process html document through online services such as Skweezer.
        """
        self.__url = url
        self.__tag = tag
        self.__base = base
        self.__path = path
        self.__cache_path = cache_path
        self.__cached = {}
        self.__try_use_print_version = try_use_print_version
        self.__allow_scripts = allow_scripts
        self.__page_template = self.page_template_default
        if page_template:
            self.__page_template = page_template
        self.__print_version_keywords = self.print_version_keywords_default
        if print_version_keywords:
            self.__print_version_keywords = print_version_keywords
        self.__callback = callback
        self.__attrs_for_fix = [("a", "href"), ("iframe", "src"), ("object", "data"), ("form", "action")]
        self.__attrs_for_cache = \
            [("applet", "codebase"), ("object", "codebase"), ("script", "src"), ("img", "src"), ("input", "src")]
        self.title = "untitled"
        self.__as_is = as_is
        self.__content_kwargs = content_kwargs
        self.__reformatter_url = reformatter_url


    def __report(self, action, item):
        if self.__callback:
            self.__callback((action, item))


    def __calc_the_tag(self, tag):
        """ Searches for tag that contains useful content such as article, news etc.

        Here is the primary FeedCircuit heuristic, sorta "know how".
        """
        (textsum, tagsum, max, node) = (0, 0, 0, None)
        for text in tag.findAll(text = lambda text: not isinstance(text, Comment), recursive = False):
            textsum += len(text.strip())
        for sub in tag.findAll(lambda t: not t.name in ["a", "script"], recursive = False):
            (taglen, maxtaglen, maxtag) = self.__calc_the_tag(sub)
            tagsum += taglen
            if maxtaglen > max:
                (max, node) = (maxtaglen, maxtag)
        result = textsum + tagsum / 2
        if result > max and tagsum:
            (max, node) = (result, tag)
        return (result, max, node)


    def __find_the_tag(self, tag):
        return self.__calc_the_tag(tag)[2]


    def __get_cache_directory(self):
        cache_path = os.path.join(self.__path, self.__cache_path)
        if not os.path.exists(cache_path):
            os.mkdir(cache_path)
        return cache_path


    def __find_print_version(self, html, base, referer):
        try:
            text = html.find(text=lambda t: t.strip().lower() in self.__print_version_keywords)
            if text:
                a = text.findParent(lambda t: t.name == "a" and t.has_key("href"))
                if a:
                    url = urljoin(base, a["href"])
                    self.__report("downloading", url)
                    (new_html, new_base, real_url) = get_page_html(url, headers={"Referer": referer})
                    return (new_html, new_base, new_html.body)
        except Exception, ex:
            self.__report("error", ex)
        return (html, base, None)


    def get_url_for_reformatter(self, url):
        if self.__reformatter_url:
            return self.__reformatter_url % urllib.quote(url)
        return url


    def __extract_tag(self):
        """ This performs main Item's job.

        By extracting html document and reformatting it.
        """
        self.__report("downloading", self.__url)
        (html, self.__base, real_url) = get_page_html(self.get_url_for_reformatter(self.__url))
        if not self.__as_is:
            if self.__try_use_print_version:
                (html, self.__base, self.__tag) = self.__find_print_version(html, self.__base, real_url)
            if not self.__reformatter_url:
                if self.__content_kwargs:
                    self.__tag = html.body.find(**self.__content_kwargs)
                if not self.__tag:
                    self.__tag = self.__find_the_tag(html.body)
        if not self.__tag:
            self.__tag = html.body
        title = html.find("title")
        if title:
            self.title = title.string.strip()


    def __fix_url(self, tag, attr, base):
        if tag.has_key(attr):
            tag[attr] = urljoin(base, tag[attr])


    def __fix_urls(self, tags, attr, base):
        [self.__fix_url(tag, attr, base) for tag in tags]


    def __cache_url(self, tag, attr, base):
        if tag.has_key(attr):
            try:
                url = urljoin(base, tag[attr])
                path_for_url = None
                if self.__cached.has_key(url):
                    path_for_url = self.__cached[url]
                else:
                    filename = uuid.uuid1().hex + urlparse(url)[2].split("/")[-1]
                    path_for_url = os.path.join(self.__cache_path, filename)
                    tag.parent.append(Comment(NavigableString(self.cached_comment_prefix + path_for_url)))
                    self.__report("downloading", url)
                    urllib.urlretrieve(quote_url_path(url), os.path.join(self.__get_cache_directory(), filename))
                    self.__cached[url] = path_for_url
                tag[attr] = path_for_url
            except Exception, ex:
                self.__report("error", ex)


    def __cache_urls(self, tags, attr, base):
        [self.__cache_url(tag, attr, base) for tag in tags]


    def __fixup_tag(self):
        """ Collection of misc postprocessing tasks.

        1. Removing scripts.
        2. Caching resources and changing references to them.
        3. Keeping consistency by preventing extracting tags from context,
        e.g. td should always be nested into tr etc.
        """
        if not self.__allow_scripts:
            [script.extract() for script in self.__tag.findAll("script")]
        [self.__fix_urls(self.__tag.findAll(x[0]), x[1], self.__base) for x in self.__attrs_for_fix]
        [self.__cache_urls(self.__tag.findAll(x[0]), x[1], self.__base) for x in self.__attrs_for_cache]
        if self.__tag.name in ["table", "dir", "menu", "dl", "ul", "ol"]:
            return [self.__tag]
        elif self.__tag.name in ["tr", "caption", "colgroup", "thead", "tfoot", "tbody"]:
            return [self.__tag.findParent("table")]
        return [node for node in self.__tag.contents]


    def save(self, title = None, filename = None):
        """ Saves item into cache. """
        html = BeautifulSoup(self.__page_template)
        if not title:
            title = self.title
        get_or_create_tag(html, ["head", "title"]).append(NavigableString(title))
        body = get_or_create_tag(html, ["body"])
        [body.append(tag) for tag in self.__contents]
        if not filename:
            filename = uuid.uuid1().hex + ".html"
        f = open(os.path.join(self.__path, filename), "w")
        f.write(html.prettify())
        f.close()
        return filename


    def process(self):
        """ Entry point. """
        self.__contents = []
        try:
            if self.__url:
                self.__extract_tag()
            self.__contents = self.__fixup_tag()
        except Exception, ex:
            self.__report("error", ex)
            self.delete()
        return self.__contents


    def __delete_cached_item(self, filename):
        if os.path.exists(filename):
            if filename.endswith(".html"):
                f = open(filename, "r")
                html = BeautifulSoup(f)
                f.close()
                self.__delete_cached_items(html.body, os.path.dirname(filename))
            self.__report("deleting", filename)
            os.remove(filename)


    def __delete_cached_items(self, tag, path):
        items_to_delete = tag.findAll(text = lambda text: \
            isinstance(text, Comment) and text.startswith(self.cached_comment_prefix))
        [self.__delete_cached_item(os.path.join(path, item[len(self.cached_comment_prefix):])) \
            for item in items_to_delete]


    def delete(self, filename = None):
        """ Deletes item and all related resources. """
        if filename:
            self.__delete_cached_item(os.path.join(self.__path, filename))
        elif self.__tag:
            self.__delete_cached_items(self.__tag, self.__path)


class Feed:
    """ Collection of feed related operations. """
    
    page_item_template_default = '<b><a/></b><div></div><br>'
    item_comment_prefix = "feed item: "

    def __init__(self, url, path = "", cache_path = "", callback = None, title = None, filename = None,
        page_template = None, page_item_template = None, print_version_keywords = None, \
        try_use_print_version = True, delete_before_date = None, allow_scripts = False, \
        inline_items = False, cache_items = False, include = None, exclude = None, as_is = False,
        content_kwargs = None, reformatter_url = None):
        """ Constructor.

        url - feed url.
        path - base directory used to store cached item
        cache_path - subdirectory of path used to cache related resources (such as images).
        callback - for reporting progress.
        title - if this is not provided it is tried to figure out automatically.
        filename - if this is not provided title.html is used to store feed
        page_template - used for creating cached items.
        page_item_template - html used to construct feed item.
        print_version_keywords - a list of phrases to scan html document searching for reference to
        printable version.
        try_use_print_version - search for reference to printable version should be performed.
        delete_before_date - all items before this date are deleted during feed update.
        allow_scripts - do not remove script tags from item before caching.
        inline_items - inline referenced resources into feed html.
        cache_items - download and cache referenced resources.
        include - regexp for item urls to include.
        exclude - regexp for item urls to exclude.
        as_is - do not try to reformat html document for small screens.
        content_kwargs - it is passed to BeautifulSoup's find method. It is usually
        tag name and its attributes.
        reformatter_url - format string (%s is replaced with url from item).
        Used to process html document through online services such as Skweezer.
        """
        self.__url = url
        self.__path = path
        self.__cache_path = cache_path
        self.__callback = callback
        self.__title = title
        self.__filename = filename
        self.__page_template = Item.page_template_default
        if page_template:
            self.__page_template = page_template
        self.__page_item_template = self.page_item_template_default
        if page_item_template:
            self.__page_item_template = page_item_template
        self.__print_version_keywords = print_version_keywords
        self.__try_use_print_version = try_use_print_version
        self.__inline_items = inline_items
        self.__cache_items = cache_items
        self.__delete_before_date = delete_before_date
        self.__allow_scripts = allow_scripts
        self.__include = None
        if include:
            self.__include = re.compile(include)
        self.__exclude = None
        if exclude:
            self.__exclude = re.compile(exclude)
        self.__as_is = as_is
        self.__content_kwargs = content_kwargs
        self.__reformatter_url = reformatter_url


    def __report(self, action, item):
        if self.__callback:
            self.__callback((action, item))


    def __unquote_special(self, str):
        if str:
            return str.replace("&lt;", "<").replace("&gt;", ">").replace("&quot;", '"').replace("&amp;", "&")
        return str


    def __get_item_link(self, item):
        if item.guid:
            if not item.guid.has_key("ispermalink") or item.guid["ispermalink"].lower() == "true":
                return item.guid.string
        if item.link:
            if item.link.has_key("href"):
                return item.link["href"] #Atom uses href to store links
            else:
                return item.link.string


    def __get_item_date(self, item):
        tag = item.find(["pubdate", "published"])
        if tag:
            try:
                (date, zone) = parse_date(tag.string)
                return date
            except Exception:
                self.__report("warning", "failed to parse " + tag.string)
        return None


    def __get_item_description(self, item):
        desc = item.find(["description", "content"])
        if desc:
            str = desc.string
            if str:
                if not isinstance(str, CData):
                    str = self.__unquote_special(str)
                return BeautifulSoup(str)
        return BeautifulSoup()


    def __item_exists(self, tag, link):
        return (self.item_comment_prefix + link) in self._existing_items


    def __outdated(self, item):
        date = self.__get_item_date(item)
        if date and self.__delete_before_date:
            return date.date() < self.__delete_before_date
        return False


    def __match_link(self, link):
        """ Filter out link using include and exclude patterns. """
        if self.__include and not self.__include.search(link):
            return False
        if self.__exclude and self.__exclude.search(link):
            return False
        return True


    def __create_feed_item(self, url = None, tag = None, base = None, path = None, cache_path = None):
        """ This is just not to type all this code when you need new Item instance. """
        if path == None:
            path = self.__path
        if cache_path == None:
            cache_path = self.__cache_path
        return Item(url = url, tag = tag, base = base, path = path, cache_path = cache_path, \
            callback = self.__callback, try_use_print_version = self.__try_use_print_version, \
            print_version_keywords = self.__print_version_keywords, allow_scripts = self.__allow_scripts, \
            as_is = self.__as_is, content_kwargs = self.__content_kwargs, reformatter_url = self.__reformatter_url)
        

    def __add_item(self, div, item):
        """ Here Item is get included to feed after executing all dependant operations. """
        link = self.__get_item_link(item)
        if self.__match_link(link) and not self.__outdated(item) and not self.__item_exists(div.parent, link):
            title = self.__unquote_special(item.title.string.strip())
            self.__report("processing", title)
            item_html = BeautifulSoup(self.__page_item_template)
            div.append(Comment(NavigableString(self.item_comment_prefix + link)))
            div.append(item_html)
            item_a = get_or_create_tag(item_html, ["a"])
            item_a["href"] = link
            item_a.append(NavigableString(title))
            item_div = get_or_create_tag(item_html, ["div"])
            if self.__inline_items:
                feed_item = self.__create_feed_item(url = link)
                [item_div.append(t) for t in feed_item.process()]
            else:
                feed_item = self.__create_feed_item(tag = self.__get_item_description(item), base = self._real_url)
                [item_div.append(t) for t in feed_item.process()]
                if self.__cache_items:
                    path = os.path.join(self.__path, self.__cache_path)
                    feed_item = self.__create_feed_item(url = link, path = path, cache_path = "")
                    feed_item.process()
                    item_a["href"] = os.path.join(self.__cache_path, feed_item.save(title))
                    item_div.insert(0, Comment(NavigableString(Item.cached_comment_prefix + item_a["href"])))


    def __delete_div(self, div):
        feed_item = self.__create_feed_item(tag = div)
        feed_item.delete()
        div.extract()


    def __delete_before(self, html):
        if self.__delete_before_date:
            divs = filter(lambda div: \
                div.has_key("id") and datetime.strptime(div["id"], "%Y%m%d%H%M%S").date() < self.__delete_before_date, \
                html.body.findAll("div", recursive = False))
            [self.__delete_div(div) for div in divs]


    def __get_new_item_count(self, tag):
        return len(tag.findAll(text = lambda text: isinstance(text, Comment)
            and text.startswith(self.item_comment_prefix)))


    def update(self):
        """ Updates feed and returns numbers of new items.

        When there are now new items feed file stays untouched.
        Outdated items are deleted only when there are some new ones.
        """
        result = 0
        try:
            self.__report("downloading", self.__url)
            page = urlopener.open(quote_url_path(self.__url))
            self._real_url = page.geturl()
            xml = BeautifulStoneSoup(page, markupMassage = massage)
            if not self.__title:
                self.__title = get_feed_title(xml)
            path = os.path.join(self.__path, self.__title + ".html")
            if self.__filename:
                path = os.path.join(self.__path, self.__filename)
            if os.path.exists(path):
                f = open(path, "r")
                html = BeautifulSoup(f)
                f.close()
            else:
                html = BeautifulSoup(self.__page_template)
                get_or_create_tag(html, ["head", "title"]).append(NavigableString(self.__title))
            self._existing_items = html.findAll(text = lambda text: \
                isinstance(text, Comment) and text.startswith(self.item_comment_prefix))
            div = Tag(html, "div")
            div["id"] = datetime.today().strftime("%Y%m%d%H%M%S")
            div.append(NavigableString(unicode(datetime.today().strftime("%c").decode(locale.getpreferredencoding()))))
            div.append(Tag(html, "hr"))
            get_or_create_tag(html, ["body"]).insert(0, div)
            [self.__add_item(div, tag) for tag in xml.findAll(["item", "entry"])]
            result = self.__get_new_item_count(div)
            if result:
                self.__delete_before(html)
                f = open(path, "w")
                f.write(html.prettify())
                f.close()
        except Exception, ex:
            self.__report("error", ex)
            self.__delete_div(div)
        return result


    def delete(self):
        """ Deletes everything related to feed from cache. """
        if self.__filename:
            path = os.path.join(self.__path, self.__filename)
        else:
            path = os.path.join(self.__path, self.__title + ".html")
        if os.path.exists(path):
            f = open(path, "r")
            html = BeautifulSoup(f)
            f.close()
            if html.body:
                for div in html.body.findAll(name = "div", recursive = False):
                    self.__delete_div(div)
            os.remove(path)
