#!/usr/bin/python import re,urllib2 import cPickle as pickle import BeautifulSoup URL_FORMAT = "http://www.tutiempo.net/en/Climate/Londres_Heathrow_Airport/{month}-{year}/37720.htm" YEARS = range(1949, 1958) + range(1974, 2012) FILENAME = 'heathrow.dat' def UrlForMonth( month, year ): m = "%02d" % month y = "%04d" % year return URL_FORMAT.replace( '{month}', m ).replace( '{year}', y ) def HtmlForMonth( month, year ): return urllib2.urlopen( UrlForMonth( month, year ) ).read() def Ensoupen( month, year ): return BeautifulSoup.BeautifulSoup( HtmlForMonth( month, year ) ) def FindClimateData( soup ): heading = soup.find(text=re.compile('Historical Weather \(Climate\)')).parent.parent.parent.parent # skip till we find the row with a table in it row = heading while not row('table'): row = row.findNextSibling('tr') climateTable = row('table')[0] row = climateTable.tr # skip rows until we find one with "1" in the first column while not row.find( text='1' ): row = row.findNextSibling('tr') return row def ParseClimateForDay( row ): def optional( val ): if val == u'-': return None return float( val ) def event( val ): return (val == u'o') info = {} cols = [td.contents[0] for td in row('td')] info['day'] = int( cols[0].string ) info['temp'] = optional( cols[1] ) info['tmax'] = optional( cols[2] ) info['tmin'] = optional( cols[3] ) info['precip'] = optional( cols[6] ) info['rain'] = event( cols[11] ) info['snow'] = event( cols[12] ) info['thndr'] = event( cols[13] ) info['fog'] = event( cols[14] ) return info def ParseClimateTable( firstRow ): month = [] row = firstRow while row('b'): month.append( ParseClimateForDay( row ) ) row = row.findNextSibling( 'tr' ) return month def BuildYearTable( year ): print 'Scraping for',year, info = [] for month in range(1,13): print '.', soup = Ensoupen( month, year ) climateData = FindClimateData( soup ) parsed = ParseClimateTable( climateData ) info.append( parsed ) print 'done' return info def main(): climate = {} for year in YEARS: climate[year] = BuildYearTable( year ) f = file(FILENAME, 'w') pickle.dump( climate, f ) f.close() if __name__ == '__main__': main()