#!/usr/bin/env python
# -*- coding: utf-8 -*-

##
#   Viito -- an image dictionary for sign language
#
#   Copyright (C) 2010  Aki Niemi <aki.niemi@iki.fi>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.

#
# This script web scrapes the papunet.net site for images and metadata
#

from BeautifulSoup import BeautifulSoup
from urllib import urlopen
from hashlib import md5
import re

# Base URL of the site being scraped; every relative href is appended to it.
papunet_root = u'http://papunet.net/yleis/'
# Path (relative to papunet_root) of the image bank's entry page.
materials = u'materiaalit/kuvapankki/'
# Output locations. Both are empty here and are overwritten by the script's
# __main__ section from the command-line argument.
data_path = u''
image_path = u''

# Raw strings keep the backslash in \w from being treated as a (invalid)
# string escape; the compiled patterns are unchanged.
# Matches the run of word characters/spaces that follows `<b>"`.
word_re = re.compile(r'(?<=<b>")[\w ]+', re.U)
# Matches the run of word characters/spaces that follows `kuva: `.
author_re = re.compile(r'(?<=kuva: )[\w ]+', re.U)

def is_sign(path):
    """Return True if `path` refers to a sign-language image.

    A path qualifies when it contains either of the substrings
    'puistolinna' or 'viittomat' anywhere, including at the very start.
    """
    # Substring membership rather than `find(...) > 0`, which wrongly
    # rejected a match located at index 0.
    return 'puistolinna' in path or 'viittomat' in path

def extract_metadata(s):
    """Extract the metadata for an image.

    Searches the string `s` with the module-level `word_re` and `author_re`
    patterns.

    Returns a `(word, author)` tuple of whitespace-stripped strings; `author`
    is the empty string when `author_re` does not match.

    Raises `Exception` when `word_re` does not match.
    """
    word_match = word_re.search(s)
    if word_match is None:
        raise Exception("image has no associated word")
    word = word_match.group().strip()

    author_match = author_re.search(s)
    if author_match is None:
        author = ""
    else:
        author = author_match.group().strip()

    return (word, author)

def fetch_and_save_image(path):
    """Download one image and store it under a content-derived file name.

    The data at URL `path` is fetched and written to a file inside the
    module-level `image_path` directory. The file is named after the MD5 hex
    digest of the downloaded bytes, so identical images share one file.

    Returns the file name (the hex digest), without the directory part.
    """
    response = urlopen(path)
    try:
        image = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    name = md5(image).hexdigest()

    # Image data is binary: text mode ('w') would translate newline bytes on
    # some platforms and corrupt the file.
    with open(image_path + name, 'wb') as f:
        f.write(image)

    return name

def extract(page):
    """Extract images and metadata from one parsed page.

    Looks at the first `<a>` inside every `<span class="photo">` of `page`,
    keeps those whose `href` passes `is_sign`, reads word/author from the
    link's `onclick` attribute, and downloads the linked image.

    Returns a list of dicts with the keys `word`, `author` and `image`
    (the latter being `images/<file name>`). A comma-separated word yields
    one entry per item, all pointing to the same image file.

    Links whose metadata cannot be extracted are reported on stdout and
    skipped.
    """

    images = []

    anchors = [span.find('a') for span in page.findAll('span', attrs={'class': 'photo'})]
    # A photo span without a link (or a link without href) has nothing to
    # fetch; skip it instead of failing on the subscript.
    elements = [a for a in anchors
                if a is not None and a.get('href') and is_sign(a['href'])]

    for e in elements:
        try:
            word, author = extract_metadata(e['onclick'])
        except Exception as err:
            # The original `except err:` referenced an undefined name, so any
            # failure here surfaced as a NameError instead of being skipped.
            print("Error processing %s: %s" % (e, str(err)))
            continue

        name = fetch_and_save_image(papunet_root + e['href'])
        words = [s.strip() for s in word.split(',')]

        for item in words:
            images.append({u'word': item, u'author': author, u'image': u'images/' + name})

    return images

def parse(doc):
    """Collect the images of a page and of the sub-pages it links to.

    Images are first extracted from `doc` itself. If `doc` contains a
    `<div id="kuvapankki_nav">`, every link inside it is fetched and its
    images are extracted as well (sub-pages are not searched for further
    sub-pages).

    Returns the combined list of image entries.
    """
    # Start with whatever the root page itself holds
    collected = list(extract(doc))

    # Sub-pages, if any, are linked from the navigation div
    nav = doc.find('div', attrs={'id': 'kuvapankki_nav'})
    if nav:
        for link in nav.findAll('a'):
            subpage = fetch_doc(papunet_root + link['href'])
            collected.extend(extract(subpage))

    return collected

def recurse_parse(subtopics):
    """Parse every sub-topic page linked from the left panel.

    `subtopics` is an iterable of link elements; each one's `href` is
    fetched, the page heading is printed as progress output, and the page is
    handed to `parse`.

    Returns the concatenated list of image entries from all sub-topics.
    """
    collected = []

    for link in subtopics:
        subtopic_doc = fetch_doc(papunet_root + link['href'])

        heading = subtopic_doc.find('h1', attrs={'class': 'csc-firstHeader'})
        title = heading.contents[0]
        print("  Scraping sub-topic: %s" % title)

        collected += parse(subtopic_doc)

    return collected

def fetch_doc(path):
    """Download the HTML at URL `path` and return it parsed as soup."""
    html = urlopen(path).read()
    soup = BeautifulSoup(html)
    return soup

def scrape(doc):
    """Scrape all topics linked from the root document.

    The topic links are taken from the `<li>` items of the fourth `<ul>` in
    `doc`. Each topic page is fetched and scraped, with progress printed to
    stdout.

    Returns a `(data, count)` tuple: `data` maps a topic's heading to its
    list of image entries (topics without images are left out), and `count`
    is the total number of entries found.
    """
    total = 0
    by_topic = {}
    topic_links = [li.find('a') for li in doc.findAll('ul')[3].findAll('li')]

    for link in topic_links:
        topic_doc = fetch_doc(papunet_root + link['href'])
        heading = topic_doc.find('h1', attrs={'class': 'csc-firstHeader'})
        pagename = heading.contents[0]

        print("Scraping topic: %s" % pagename)

        # Each topic has one or more pages. A topic can be divided
        # into sub-topics (listed on the left navigation panel), and
        # each page can have one or more sub-pages (linked from the
        # root page).

        # A fifth <ul> on the page means the topic has sub-topics
        try:
            subtopic_list = topic_doc.findAll('ul')[4]
        except IndexError:
            found = parse(topic_doc)
        else:
            sublinks = [li.find('a') for li in subtopic_list.findAll('li')]
            found = recurse_parse(sublinks)

        num = len(found)
        total += num

        if num > 0:
            by_topic[pagename] = found

        print("Total: %d images and metadata" % num)

    return by_topic, total

def data_to_xml(data):
    """Create the XML resource file contents from scraped data.

    `data` maps a topic name to a list of image entries, each a dict with
    the keys `word`, `author` and `image`.

    Returns the document as one unicode string: a `<root>` element holding
    one `<topic>` per key of `data`, each holding one `<image>` per entry.
    """
    from xml.sax.saxutils import escape

    def attr(value):
        # Values come from scraped pages; escape &, <, > and the double
        # quote so they cannot break out of the attribute or the markup.
        return escape(value, {u'"': u'&quot;'})

    # Collect the pieces and join once instead of repeated string +=.
    parts = [
        u'<?xml version="1.0" encoding="UTF-8"?>\n\n',
        u'<root lang="fi" url="%s">\n' % attr(papunet_root),
    ]

    for (topic, images) in data.items():

        parts.append(u'  <topic name="%s">\n' % attr(topic))

        for i in images:
            parts.append(u'    <image name="%s" author="%s" file="%s" />\n'
                         % (attr(i['word']), attr(i['author']), attr(i['image'])))

        parts.append(u'  </topic>\n')

    parts.append(u'</root>\n')
    return u''.join(parts)

if __name__ == "__main__":
    # Script entry point: scrape the image bank into <output_path>/images/
    # and write the metadata to <output_path>/images.xml.
    import codecs
    import os
    import sys

    # Validate the command line first so a usage error does not cost a
    # network round trip.
    if len(sys.argv) < 2:
        print("Data path missing!")
        print("Usage: %s <output_path>" % sys.argv[0])
        sys.exit(1)

    # These rebind the module-level names read by fetch_and_save_image().
    data_path = sys.argv[1] + '/'
    image_path = data_path + u'images/'

    # makedirs() creates data_path on the way to image_path. Previously an
    # already existing data_path made the first makedirs() raise, which
    # skipped creating the images directory altogether.
    if not os.path.isdir(image_path):
        os.makedirs(image_path)

    doc = fetch_doc(papunet_root + materials)

    (data, count) = scrape(doc)
    xml = data_to_xml(data)

    # The context manager closes the file even if the write fails.
    with codecs.open(data_path + 'images.xml', mode='w', encoding='utf-8') as f:
        f.write(xml)

    print("\nWrote %simages.xml with %d images\n" % (data_path, count))
