forked from fgregg/scraping-intro
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcomplete_page.py
58 lines (41 loc) · 1.54 KB
/
complete_page.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import urllib2
from BeautifulSoup import BeautifulSoup
# Scrape an Illinois Department of Revenue report page and print, for each
# report table, its heading cells followed by every (tax name, amounts)
# pair found in that table.

# Our url
address = 'https://www.revenue.state.il.us/app/kob/KOBReport?r=Specific&p=20121&m=0160001'

# Read in the response to a HTTP request.  The connection is closed
# explicitly, even when the read fails, instead of being left open until
# the interpreter happens to reclaim it.
connection = urllib2.urlopen(address)
try:
    response = connection.read()
finally:
    connection.close()

# BeautifulSoup is a very nice library for parsing HTML and walking
# through an HTML document hierarchy
soup = BeautifulSoup(response)

# Select the tables whose cellspacing attribute is "3".
tables = soup.fetch('table', {'cellspacing': '3'})

# tax_info collects one (headers, tax_list) pair per table.
# the first table is the form, so it is skipped
tax_info = []
for table in tables[1:]:
    taxes = table.fetch('td', {'align': 'right', 'valign': 'top'})

    tax_list = []
    for tax in taxes:
        tax_name = tax.contents[1].text

        # Work on the cell's raw markup: strip the <br /> and </td> tags
        # and every comma, then split what is left into lines.
        tax_rows = str(tax)
        for unwanted in ('<br />', '</td>', ','):
            tax_rows = tax_rows.replace(unwanted, '')
        tax_rows = tax_rows.split('\n')

        # NOTE(review): the first two lines are skipped -- presumably the
        # opening <td> tag and the tax name; confirm against the page.
        amounts = tax_rows[2:]
        tax_list.append((tax_name, amounts))

    # The heading is the first nested table with width="600"; this raises
    # IndexError if a table has no such heading.
    heading = table.fetch('table', {'width': '600'})[0]
    headers = []
    for header in heading.fetch('td'):
        header_text = header.text
        if 'Number of Taxpayers' in header_text:
            # Keep only the text after the first colon, trimmed and with
            # commas removed.
            num_tax_payers = header_text.split(':')[1].strip().replace(',', '')
            headers.append(num_tax_payers)
        else:
            headers.append(header_text)

    tax_info.append((headers, tax_list))

# Single-argument print(...) produces the same output as the Python 2
# print statement.  The statement form `print '\t', tax` writes no
# separator space after an item ending in a tab, so each tax line is a
# tab immediately followed by str(tax).
for county in tax_info:
    print(county[0])
    for tax in county[1]:
        print('\t' + str(tax))