#!/usr/bin/python
# -*- coding: utf-8 -*-
import formatter
import htmllib
import os
import sys
import urllib2
from xml.sax.saxutils import escape

class Formatter(formatter.NullFormatter):
  """Formatter handed to Parser; adds nothing to formatter.NullFormatter."""

class Parser(htmllib.HTMLParser):
  """Collect the rows of HTML tables whose first header cell equals `key`.

  The first non-empty row seen after each <table> tag is taken as that
  table's header.  Every later non-empty row of a table whose header starts
  with `key` is appended to `table` as a list of cell texts.  The href of the
  first link found in column `vn` is inserted into the row right after that
  column's text (None when the cell has no link).  Text and links inside
  <sup> elements are ignored.  When the first cell of a row carries a
  rowspan of N, the following N-1 rows are skipped.

  Arguments:
    format -- formatter instance handed to htmllib.HTMLParser
    table  -- list that receives the collected rows
    key    -- text of the first header cell of the tables to collect
    vn     -- zero-based index of the column whose link is recorded
  """

  def __init__(self, format, table, key, vn):
    htmllib.HTMLParser.__init__(self, format)
    self.header = None   # cells of the current table's header row
    self.entry = None    # cells of the row being read; None outside a row
                         # and inside a skipped row
    self.item = None     # text of the cell being read; None outside a cell
    self.rowspan = None  # rows left in the current rowspan group, or None
    self.table = table
    self.key = key
    self.vn = vn
    self.href = None     # first link target seen in the current cell
    self.ign = 0         # nesting depth of <sup> elements

  def start_table(self, attrs):
    # Each table brings its own header row.
    self.header = None

  def start_tr(self, attrs):
    if self.rowspan is not None:
      self.rowspan -= 1
      if self.rowspan > 0:
        # Still inside the group opened by an earlier row: skip this row.
        self.entry = None
        return
      # Group finished; stop counting instead of running into negatives.
      self.rowspan = None
    self.entry = []

  def end_tr(self):
    if not self.entry:
      # Skipped row (None) or row without any cell ([]).  Recording an empty
      # row as header would make the header[0] lookup below fail, and an
      # empty data row carries nothing useful.
      self.entry = None
      return
    if self.header is None:
      self.header = self.entry
    elif self.header[0] == self.key:
      self.table.append(self.entry)
    self.entry = None

  def start_td(self, attrs):
    if self.entry is None:
      return
    self.item = ""
    self.href = None
    for key, value in attrs:
      # Only the first cell of a row opens a rowspan group.
      if key == "rowspan" and not self.entry:
        try:
          self.rowspan = int(value)
        except ValueError:
          # Malformed attribute value: treat the cell as not spanning rows.
          pass

  def end_td(self):
    if self.entry is None or self.item is None:
      # Skipped row, or no cell is currently open: nothing to store.
      return
    self.entry.append(self.item)
    if len(self.entry) - 1 == self.vn:
      # The link of column `vn` is stored right after that column's text.
      self.entry.append(self.href)
    self.item = None
    self.href = None

  def start_th(self, attrs):
    # Header cells are collected exactly like data cells.
    self.start_td(attrs)

  def end_th(self):
    self.end_td()

  def start_a(self, attrs):
    if self.ign > 0:
      return
    if self.href is not None:
      # Keep only the first link of a cell.
      return
    for key, value in attrs:
      if key == "href":
        self.href = value

  def end_a(self):
    pass

  def start_sup(self, attrs):
    self.ign += 1

  def end_sup(self):
    self.ign -= 1

  def handle_data(self, data):
    if self.ign > 0:
      return
    if self.item is not None:
      self.item += data

def fetch_table(url, key, vn):
  """Download `url` and return the table rows collected by Parser.

  `key` and `vn` are passed straight to Parser: `key` is the text of the
  first header cell of the tables to collect and `vn` the column whose link
  is recorded.  For offline runs a "file:" URL pointing at a saved copy of
  the page can be passed instead, e.g. "file:List_of_mobile_country_codes".
  """
  rows = []
  parser = Parser(Formatter(), rows, key, vn)
  doc = urllib2.urlopen(url)
  try:
    # read() without a size returns the whole document.
    parser.feed(doc.read())
  finally:
    # Release the connection even when reading or parsing fails.
    doc.close()
  parser.close()
  return rows

mcclist = fetch_table(
  "http://en.wikipedia.org/wiki/List_of_mobile_country_codes",
  "Code (MCC)", 2)
pfxlist = fetch_table(
  "http://en.wikipedia.org/wiki/International_mobile_phone_codes",
  "Country Name", 0)

# Wikipedia article names that still appear in links under an older or
# alternative title, mapped to the title used as the common key.
canon_map = {
  "Ireland": "Republic_of_Ireland",
  "Ivory_Coast": "C%C3%B4te_d%27Ivoire",
  "Democratic_People%27s_Republic_of_Korea": "North_Korea",
  "Republic_of_Korea": "South_Korea",
  "Kyrgyz_Republic": "Kyrgyzstan",
  "Lao_People%27s_Democratic_Republic": "Laos",
  "Palestinian_Authority": "Palestinian_territories",
  "Per%C3%BA": "Peru",
  "Republic_of_Congo": "Republic_of_the_Congo",
  "Democratic_Republic_of_Congo": "Democratic_Republic_of_the_Congo",
  "Syrian_Arab_Republic": "Syria",
  "United_States": "United_States_of_America",
}

def get_canon(name):
  """Return the canonical form of the link `name`.

  Links starting with "/wiki/" have their article name replaced by the
  canon_map entry when there is one.  Anything else, including None (a table
  cell that had no link), is returned unchanged.
  """
  prefix = "/wiki/"
  if name is None or not name.startswith(prefix):
    return name
  wname = name[len(prefix):]
  return prefix + canon_map.get(wname, wname)

# Merge prefixes: map each prefix to the list of distinct canonical links of
# the entries that use it.
pfxtab = {}
for line in pfxlist:
  # Fields used: line[1] is the link, line[2] the prefix.  Rows too short to
  # hold a prefix are skipped, as are rows without a link, since the link is
  # the only key that ties a prefix to an MCC entry.
  if len(line) < 3 or line[1] is None:
    continue
  wname = get_canon(line[1])
  # for this list, ignore area codes (everything from the first space on)
  prefix = line[2].split(" ", 1)[0]
  wnames = pfxtab.setdefault(prefix, [])
  if wname not in wnames:
    wnames.append(wname)

# Prefixes in sorted (string) order, used to order the output.
pfxkey = sorted(pfxtab)

# Merge MCCs: map each canonical link to [code, name, mcc, mcc, ...], where
# code and name come from the first row seen for that link.
mcctab = {}
for line in mcclist:
  # Fields used: mcc, code, name, link.  Rows too short to hold all four are
  # skipped, as are rows without a link, since the link is the merge key.
  if len(line) < 4 or line[3] is None:
    continue
  mcc, code, name = line[0], line[1], line[2]
  wname = get_canon(line[3])
  # ignore comments in name (everything from the first " (" on)
  name = name.split(" (", 1)[0]
  mcctab.setdefault(wname, [code, name]).append(mcc)

# Prefixes whose region code and name are fixed here instead of being taken
# from the first matching country.
canon_pfx = {
  "+1": ("US_CA", "North America"),
  "+7": ("RU", "Russia"),
}

# Write one <region> element per prefix, listing every MCC of the countries
# that use the prefix.
out = open("regions.xml", "w")
try:
  out.write("<regions>\n")
  for prefix in pfxkey:
    code, name = canon_pfx.get(prefix, (None, None))
    mccs = []
    for wname in pfxtab[prefix]:
      if wname not in mcctab:
        # Same text as the Python 2 print statement, but also valid syntax
        # on Python 3.
        print("Country not found: %s" % wname)
        continue
      data = mcctab[wname]
      if code is None:
        # The first matching country names the region.
        code, name = data[0], data[1]
      mccs.extend(data[2:])
    if code is None:
      # No country matched and no fixed entry exists: there is nothing to
      # describe this region with, so leave it out.
      continue
    mccs.sort()
    out.write("  <region>\n")
    # Values come from scraped pages; escape them so the XML stays
    # well-formed.
    out.write("    <prefix>%s</prefix>\n" % escape(prefix))
    out.write("    <code>%s</code>\n" % escape(code.lower()))
    out.write("    <name>%s</name>\n" % escape(name))
    for mcc in mccs:
      out.write("    <mcc>%s</mcc>\n" % escape(mcc))
    out.write("  </region>\n")
  out.write("</regions>\n")
finally:
  # Flush and close the file even if writing fails part-way.
  out.close()
