forked from fgregg/scraping-intro
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgrab_tax_list.py
79 lines (65 loc) · 2.07 KB
/
grab_tax_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import urllib2
from BeautifulSoup import BeautifulSoup
# Scrape a tax report page: fetch the HTML, locate every tax column in
# the table, and build a list of (tax name, [amount strings]) tuples.

# URL of the report page to scrape
address = 'https://www.revenue.state.il.us/app/kob/KOBReport?r=Specific&p=20121&m=0160001'

# Read in the response to an HTTP request.  The handle returned by
# urlopen() is closed explicitly so the connection is released even if
# read() raises.
connection = urllib2.urlopen(address)
try:
    response = connection.read()
finally:
    connection.close()

# BeautifulSoup is a library for parsing HTML and walking through an
# HTML document hierarchy
soup = BeautifulSoup(response)

# We want to grab all the td elements like the one below
#
#   <TD CLASS="data" ALIGN="right" VALIGN="top" NOWRAP>
#   <CENTER><B>ST</B></CENTER>
#   18,887,148.64<BR>
#   16,185,354.22<BR>
#   ...
#   224,057,615.23<BR><BR></TD>
taxes = soup.fetch('td', {'align': 'right', 'valign': 'top'})

tax_list = []
for tax in taxes:
    # The element holding the name of the tax is the second child of the
    # TD element (index 1).  It is not the first because BeautifulSoup
    # counts the newline before <CENTER> as a child of its own.
    #
    #   <CENTER><B>ST</B></CENTER>
    #
    # .text returns all the text enclosed by the start and closing tags.
    # NOTE(review): assumes every matched cell has this layout; a cell
    # without a second child element would raise here.
    tax_name = tax.contents[1].text

    # Turn the BeautifulSoup element back into a plain string that looks
    # like
    #
    #   <td class="data" align="right" valign="top" nowrap="NOWRAP">
    #   <center><b>RTA</b></center>
    #   <br />
    #   <br />
    #   1,587.98<br />
    #   895.65<br />
    #   118.50<br />
    #   4,151.20<br /><br /></td>
    tax_rows = str(tax)

    # Remove the non-informative pieces: line-break tags, the closing
    # cell tag, and the thousands separators inside the amounts.
    for junk in ('<br />', '</td>', ','):
        tax_rows = tax_rows.replace(junk, '')

    # Split the string into a list at the newlines
    tax_rows = tax_rows.split('\n')

    # The first two lines are the opening td tag and the tax name (which
    # we already captured); everything after them is an amount.  Rows
    # that had no amount stay in the list as empty strings.
    amounts = tax_rows[2:]

    tax_list.append((tax_name, amounts))

# A single parenthesised argument prints the same under the Python 2
# print statement and is also valid as a Python 3 function call.
print(tax_list)