from sgmllib import SGMLParser
import os, re, time, urllib, random, sys, math

from urllib import quote


class AppURLopener(urllib.FancyURLopener):
    """FancyURLopener that identifies itself with a browser User-Agent string.

    The only customisation is the ``version`` attribute; everything else is
    inherited from ``urllib.FancyURLopener``.
    """

    def __init__(self, *args):
        """Set the browser-like ``version`` and delegate to the base class.

        All positional arguments are forwarded unchanged to
        ``urllib.FancyURLopener.__init__``.
        """
        browser_id = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)"
        # Assigned before delegating: per the Python 2 urllib documentation
        # the opener's User-Agent header is taken from ``version``, so the
        # override has to be in place when the base initialiser runs.
        self.version = browser_id
        urllib.FancyURLopener.__init__(self, *args)


# Install the custom opener as urllib's module-level default. Per the
# Python 2 urllib documentation, urllib.urlopen() uses the instance stored in
# urllib._urlopener when one is set, so every request made through
# urllib.urlopen() is sent with AppURLopener's User-Agent string.
urllib._urlopener = AppURLopener()

# Local alias for urllib.quote; hitCount() uses it to escape the search query.
url_quote = quote


def hitCount(query):
    """Return the number of results AltaVista reports for ``query``.

    The query is URL-quoted, substituted into an AltaVista advanced-search
    URL and fetched with ``urllib.urlopen``. The count is scraped from the
    lower-cased page: the text between a known marker ('altavista found' or,
    failing that, 'of <b>') and the following word 'result' is stripped of
    '.' and ',' thousands separators and parsed as an integer.

    Parameters:
        query -- search expression, e.g. '"France" NEAR "Germany"'.

    Returns:
        The parsed hit count as an int, or 0 when no marker is present or
        the extracted text is not a number.

    Raises:
        Whatever ``urllib.urlopen`` / ``read`` raise on network failure
        (the caller is expected to handle or retry).
    """
    url = (
        'http://www.altavista.com/web/results?lang=en&aqb=&sc=on&aqa=%s&aqmode=s&lh=&dt=tmperiod&avkw=aapt&aqs=&rc=dmn&nbq=10&pg=aq&d2=0'
        % url_quote(query)
    )

    sock = urllib.urlopen(url)
    try:
        html = sock.read().lower()
    finally:
        # Always release the connection, even if the read fails.
        sock.close()

    # Try the markers in order of preference; the first one present wins.
    for tag in ('altavista found', 'of <b>'):
        p = html.find(tag)
        if p >= 0:
            break
    else:
        return 0
    html = html[p + len(tag):]

    # Keep only the text preceding the word "result(s)". If the word is
    # missing, leave the text alone instead of slicing with -1 (which would
    # silently drop the last character).
    p = html.find('result')
    if p >= 0:
        html = html[:p]

    # Remove thousands separators in either locale style.
    html = html.replace('.', '').replace(',', '')

    try:
        return int(html)
    except ValueError:
        # Extracted text was not a plain number (e.g. leftover markup).
        return 0


def countryList():
    """Return a fresh list of ``[name, count]`` pairs, one per country/territory.

    Each entry is a two-element *list* (not a tuple) because the script body
    overwrites the count in place when a keyword is supplied. A new outer list
    and new inner lists are built on every call, so callers may mutate the
    result freely.

    The counts are presumably baseline search hit counts for the name alone,
    recorded when the table was compiled -- TODO confirm.

    NOTE(review): "Guinea" appears twice with the same count, and both
    "Anguilla" and "Anguila" are present; kept as-is to preserve output.
    """
    table = (
        ("Andorra", 507368),
        ("United Arab Emirates", 485675),
        ("Afghanistan", 1659914),
        ("Antigua and Barbuda", 150533),
        ("Anguilla", 357580),
        ("Albania", 562171),
        ("Armenia", 546460),
        ("Netherlands Antilles", 244385),
        ("Angola", 539872),
        ("Antarctica", 612907),
        ("Argentina", 4614932),
        ("American Samoa", 203345),
        ("Austria", 4369057),
        ("Australia", 14428914),
        ("Anguila", 5840),
        ("Aruba", 570714),
        ("Azerbaijan", 479812),
        ("Bosnia", 769785),
        ("Barbados", 827197),
        ("Bangladesh", 950767),
        ("Belgium", 3853098),
        ("Burkina Faso", 353621),
        ("Bulgaria", 1341424),
        ("Bahrain", 496097),
        ("Burundi", 365045),
        ("Benin", 376006),
        ("Bermuda", 911995),
        ("Brunei", 525944),
        ("Bolivia", 1282330),
        ("Brazil", 3353612),
        ("Bahamas", 1266815),
        ("Bhutan", 313043),
        ("Bouvet Island", 23080),
        ("Botswana", 497482),
        ("Belarus", 915687),
        ("Belize", 638180),
        ("Canada", 22947345),
        ("Congo", 743767),
        ("Central African Republic", 99158),
        ("Congo Brazzavile", 129),
        ("Switzerland", 4464939),
        ("Ivory Coast", 159785),
        ("Cook Islands", 141587),
        ("Chile", 3273012),
        ("Cameroon", 365883),
        ("China", 10503149),
        ("Colombia", 2391704),
        ("Costa Rica", 2156026),
        ("Cuba", 2558546),
        ("Cape Verde", 149038),
        ("Christmas Island", 89484),
        ("Cyprus", 1206499),
        ("Czech Republic", 1661565),
        ("Germany", 13011450),
        ("Djibouti", 228802),
        ("Denmark", 2821104),
        ("Dominica", 442528),
        ("Dominican Republic", 769942),
        ("Algeria", 708433),
        ("Ecuador", 1503548),
        ("Estonia", 1104363),
        ("Egypt", 2919818),
        ("Western Sahara", 67282),
        ("Eritrea", 326046),
        ("Spain", 6262644),
        ("Ethiopia", 572384),
        ("Finland", 3029327),
        ("Fiji", 717135),
        ("Falkland Islands", 96205),
        ("Micronesia", 240281),
        ("Faroe Islands", 104229),
        ("France", 17079307),
        ("Gabon", 261195),
        ("Great Britain (UK*)", 49484),
        ("Grenada", 635651),
        ("Georgia", 6911046),
        ("French Guiana", 92279),
        ("Ghana", 741162),
        ("Gibraltar", 586670),
        ("Greenland", 457377),
        ("Gambia", 316286),
        ("Guinea", 937213),
        ("Guadeloupe", 390825),
        ("Equatorial Guinea", 117821),
        ("Greece", 3314249),
        ("Guatemala", 1252271),
        ("Guam", 505723),
        ("Guinea", 937213),
        ("Guyana", 361045),
        ("Hong Kong", 4372714),
        ("Honduras", 932766),
        ("Croatia", 901307),
        ("Haiti", 668931),
        ("Hungary", 2009608),
        ("Indonesia", 2669256),
        ("Ireland", 6484026),
        ("Israel", 6124467),
        ("India", 8285435),
        ("Iraq", 3962392),
        ("Iran", 1898788),
        ("Iceland", 927557),
        ("Italy", 8194116),
        ("Jamaica", 1503733),
        ("Jordan", 3596335),
        ("Japan", 11964871),
        ("Kenya", 1329593),
        ("Kyrgyzstan", 286245),
        ("Cambodia", 874170),
        ("Comoros", 128402),
        ("North Korea", 480082),
        ("South Korea", 861535),
        ("Kuwait", 902961),
        ("Kazakhstan", 545513),
        ("Laos", 683111),
        ("Lebanon", 1611341),
        ("Saint Lucia", 213232),
        ("Liechtenstein", 476879),
        ("Sri Lanka", 1255410),
        ("Liberia", 433926),
        ("Lesotho", 261276),
        ("Lithuania", 1010669),
        ("Luxembourg", 1509947),
        ("Latvia", 993776),
        ("Libya", 379545),
        ("Morocco", 875836),
        ("Monaco", 1211341),
        ("Moldova", 483230),
        ("Madagascar", 456664),
        ("Marshall Islands", 114943),
        ("Macedonia", 588851),
        ("Mali", 618361),
        ("Myanmar", 612176),
        ("Mongolia", 418530),
        ("Macau", 454230),
        ("Martinique", 408059),
        ("Mauritania", 234854),
        ("Montserrat", 272426),
        ("Malta", 1446525),
        ("Mauritius", 619848),
        ("Maldives", 399717),
        ("Malawi", 411174),
        ("Mexico", 12860749),
        ("Malaysia", 2896572),
        ("Mozambique", 445068),
        ("Namibia", 561881),
        ("New Caledonia", 196439),
        ("Niger", 379119),
        ("Norfolk Island", 75537),
        ("Nigeria", 1065105),
        ("Nicaragua", 942399),
        ("Netherlands", 4278864),
        ("Norway", 3049784),
        ("Nepal", 1255808),
        ("Nauru", 139532),
        ("Neutral Zone", 27949),
        ("New Zealand", 5220176),
        ("Oman", 538416),
        ("Panama", 1959774),
        ("Peru", 2534387),
        ("French Polynesia", 250512),
        ("Papua New Guinea", 357961),
        ("Philippines", 2321785),
        ("Pakistan", 2180449),
        ("Poland", 2836355),
        ("Puerto Rico", 2244142),
        ("Portugal", 4164322),
        ("Paraguay", 1056969),
        ("Qatar", 435277),
        ("Reunion", 1958013),
        ("Romania", 1626421),
        ("Russia", 5470901),
        ("Rwanda", 426131),
        ("Saudi Arabia", 1006404),
        ("Seychelles", 315412),
        ("Sudan", 636461),
        ("Sweden", 4009711),
        ("Singapore", 3994066),
        ("St. Helena", 175232),
        ("Slovenia", 974837),
        ("Slovak Republic", 189661),
        ("Sierra Leone", 368211),
        ("San Marino", 384320),
        ("Senegal", 740999),
        ("Somalia", 409418),
        ("Suriname", 236361),
        ("Sao Tome and Principe", 48854),
        ("El Salvador", 850296),
        ("Syria", 898835),
        ("Swaziland", 259978),
        ("Chad", 1250832),
        ("Togo", 333450),
        ("Thailand", 3746028),
        ("Tajikistan", 200424),
        ("Turkmenistan", 336623),
        ("Tunisia", 628396),
        ("Tonga", 351343),
        ("East Timor", 227194),
        ("Turkey", 3330430),
        ("Trinidad and Tobago", 237116),
        ("Tuvalu", 143394),
        ("Taiwan", 2969225),
        ("Tanzania", 703178),
        ("Ukraine", 1860363),
        ("Uganda", 676270),
        ("United Kingdom", 5432441),
        ("United States", 22451899),
        ("Uruguay", 1552015),
        ("Uzbekistan", 496764),
        ("Vatican City", 102054),
        ("Saint Vincent & the Grenadines", 9584),
        ("Venezuela", 2375470),
        ("Virgin Islands (British)", 21933),
        ("Virgin Islands", 769543),
        ("Vietnam", 3024465),
        ("Vanuatu", 279065),
        ("Wallis and Futuna Islands", 8060),
        ("Samoa", 471357),
        ("Yemen", 431068),
        ("Mayotte", 96884),
        ("Yugoslavia", 862796),
        ("South Africa", 3073302),
        ("Zambia", 497879),
        ("Zimbabwe", 995324),
    )
    # Convert each immutable row into its own mutable [name, count] list.
    return [[name, count] for name, count in table]


def saveMatrix(matrix, countries, weights, size, keyw):
    """Write the top-left ``size`` x ``size`` part of ``matrix`` as CSV text.

    The output file is ``res_<keyw>.txt`` in the current directory and is
    overwritten on every call. Layout:

        line 1      -- the first ``size`` country names, comma-separated
        line 2      -- the first ``size`` weights, comma-separated
        lines 3...  -- one matrix row per line, values formatted "%2.8f"

    Parameters:
        matrix    -- sequence of rows (sequences of numbers).
        countries -- sequence of country-name strings.
        weights   -- sequence of strings (they are joined as-is).
        size      -- number of leading rows/columns to write.
        keyw      -- keyword used to build the file name (may be '').
    """
    f = open('res_%s.txt' % keyw, 'w')
    try:
        f.write(','.join(countries[:size]) + '\n')
        f.write(','.join(weights[:size]) + '\n')
        for row in matrix[:size]:
            f.write(','.join(["%2.8f" % val for val in row[:size]]) + '\n')
    finally:
        # Close the handle even if formatting or writing fails part-way.
        f.close()


if 1:
    if len(sys.argv) > 1:
        keyw = sys.argv[1]
    else:
        keyw = ''

    lst = countryList()
    if keyw != '':
        print 'getting counts for', keyw
        for c in lst:
            country = c[0]
            print 'counting ', country
            c[1] = hitCount('"%s" NEAR "%s"' % (country, keyw))
    file('scores_' + keyw, 'w').write(` lst `)
    lst.sort(lambda l1, l2: cmp(l2[1], l1[1]))
    matrix = [[0.0] * len(lst) for i in lst]
    countries = [country.strip() for country, score in lst]
    weights = [str(score) for country, score in lst]
    rows = 0
    for countryfrom, scorefrom in lst:
        scores = []
        cols = 0
        for countryto, scoreto in lst[:rows]:
            print '*',
            searchfor = '"%s" NEAR "%s"' % (countryfrom, countryto)
            if keyw != '':
                searchfor + ' NEAR "%s"' % keyw
            hits = None
            while hits == None:
                try:
                    hits = hitCount(searchfor)
                except:
                    hits = None
            score = hits / math.sqrt(scoreto * scorefrom)
            matrix[cols][rows] = score
            matrix[rows][cols] = score
            cols += 1
        rows += 1
        saveMatrix(matrix, countries, weights, rows, keyw)
        print ''
        print countryfrom, 'ready'
