-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpoverty.py
103 lines (84 loc) · 3.36 KB
/
poverty.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Gets the NYCgov poverty rate at the community district level
import urllib2
import urllister
import re
import pandas as pd
import ast
###
# Gets the NYCgov poverty rate data from the NYC Opportunity (nyc.gov/opportunity) website
###
def getNYCGovPoverty():
    """Download the NYCgov poverty data and return one rate per community district row.

    The function fetches ``cd_data.js`` from www1.nyc.gov, strips the
    ``cd_data[N] = `` assignment prefixes and semicolons, and evaluates each
    remaining line as a Python literal. It keeps the records whose column 3
    equals ``'POVERTY'`` and whose column 11 names one of the five boroughs,
    then converts the district label in column 11 (text before `` (``) into
    borough-prefixed district codes.

    Returns:
        pandas.DataFrame: two columns, in this order:
            ``nycgov_cd_pov_rate``  -- column 12 of the source records.
            ``community_districts`` -- district codes such as ``M01,M02``;
            the letter is M (Manhattan), B (Bronx), K (Brooklyn),
            Q (Queens) or S (Staten Island).
        The index is the position of each kept record in the source file.

    Raises:
        urllib2.URLError: if the download fails.
        ValueError / SyntaxError: if a record line is not a valid literal.
    """
    from contextlib import closing

    # read the data from the nycopp site; closing() releases the connection
    # even when read() fails
    jsfile = "https://www1.nyc.gov/assets/opportunity/js/agencies/cd_data.js"
    with closing(urllib2.urlopen(jsfile)) as con:
        nycgov_str = con.read()

    # drop the "cd_data[N] = " prefixes and every semicolon so that each line
    # is a bare literal
    nycgov_str = re.sub(r"cd_data\[[0-9]*\]\ =\ ", "", nycgov_str)
    nycgov_str = nycgov_str.replace(";", "")

    # the first and last lines are not records; blank lines are skipped
    # because ast.literal_eval cannot parse them
    record_lines = nycgov_str.split('\n')[1:-1]
    records = [ast.literal_eval(line) for line in record_lines if line.strip()]

    # create a dataframe with only the columns used below
    nycgov_df = pd.DataFrame(records)[[3, 11, 12]]

    # keep only the 'POVERTY' records
    nycgov_df = nycgov_df.loc[nycgov_df[3] == 'POVERTY']

    # keep only rows whose district label names a borough; na=False treats
    # missing labels as "no match" instead of failing
    in_borough = nycgov_df[11].str.contains(
        'Manhattan|Bronx|Brooklyn|Queens|Staten', na=False)
    nycgov_df = nycgov_df[in_borough]

    # the label looks like "<Borough> <numbers> (<neighborhood>)"; only the
    # part before " (" is needed
    district = nycgov_df[11].str.split(r' \(').str[0]

    # collapse "Staten Island" so the borough name is a single token
    district = district.str.replace('Staten Island', 'StatenIsland')

    # column 0 = borough name, column 1 = district number(s)
    temp_df = district.str.split(" ", n=1, expand=True)
    temp_df[1] = temp_df[1].str.replace(' & ', ',')

    # prefix community district code with Manhattan's letter, zero-padding
    # single digits. regex=True is explicit because newer pandas releases no
    # longer treat the pattern as a regular expression by default.
    # NOTE(review): these three rules are kept exactly as they were; a pair
    # of two-digit districts such as "10,11" would only get its first number
    # prefixed -- confirm no such pair exists in the data.
    prefix_rules = [
        (r'(?is)[0-9]+2', r'M\g<0>'),
        (r'(?is)\b\d\b', r'M0\g<0>'),
        (r'^[0-9]{1,2}', r'M\g<0>'),
    ]
    for pattern, replacement in prefix_rules:
        temp_df[1] = temp_df[1].str.replace(pattern, replacement, regex=True)

    # update the community district codes with each borough's own letter
    borough_letters = [
        ('Bronx', 'B'),
        ('Brooklyn', 'K'),
        ('Queens', 'Q'),
        ('StatenIsland', 'S'),
    ]
    for borough, letter in borough_letters:
        temp_df.loc[temp_df[0] == borough, 1] = temp_df[1].str.replace('M', letter)

    # combine the rate with the district codes (aligned on the row index)
    result = nycgov_df[[12]].join(temp_df[[1]])
    return result.rename(columns={
        12: 'nycgov_cd_pov_rate',
        1: 'community_districts'
    })
###
# Connects to the Census API and gets the population in poverty and total pop, calculates the poverty rate
###
def getCensusPoverty(year, zipcodes, max_lookback=10):
    """Fetch ACS 5-year poverty counts per ZCTA and compute the poverty rate.

    The function requests variables ``B06012_002E`` and ``B06012_001E`` from
    ``https://api.census.gov/data/<year>/acs/acs5``. When the request fails
    with an HTTP error, it retries with the previous year, at most
    ``max_lookback`` times.

    Args:
        year: integer year to try first.
        zipcodes: value appended (via ``str()``) after
            ``zip+code+tabulation+area:`` in the request URL.
            NOTE(review): presumably a comma-separated ZCTA list or ``*`` --
            confirm against callers.
        max_lookback: how many earlier years to try after ``year`` before
            giving up. The default keeps existing two-argument calls working
            while preventing an endless retry loop.

    Returns:
        pandas.DataFrame: one row per returned record, with the columns
        ``census_total_pov`` (first requested variable),
        ``census_total_pop`` (second requested variable), ``zcta`` (third
        response column) and ``census_pov_rate``
        (``census_total_pov / census_total_pop * 100``).

    Raises:
        urllib2.HTTPError: the last HTTP error, once ``max_lookback`` earlier
            years have also failed.
        ValueError: if the response body is not valid JSON.
    """
    import json
    from contextlib import closing

    # make a call to the api to see if anything returns. If it doesn't,
    # decrement the year -- but never go further back than oldest_year
    oldest_year = year - max_lookback
    while True:
        url = ('https://api.census.gov/data/' + str(year)
               + '/acs/acs5?get=B06012_002E,B06012_001E'
               + '&for=zip+code+tabulation+area:' + str(zipcodes))
        print('Connecting to ' + url)
        try:
            con = urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            print(e)
            if year <= oldest_year:
                # every candidate year failed; surface the last error
                # instead of looping forever
                raise
            year = year - 1
        else:
            print("Connection a success!")
            print(year)
            # closing() releases the connection even when read() fails
            with closing(con):
                census_data = con.read()
            break

    # the response is a JSON array of rows whose first row is the header;
    # json.loads also accepts JSON null, which ast.literal_eval rejects
    rows = json.loads(census_data)[1:]

    # convert the response into a dataframe
    census_df = pd.DataFrame(rows)
    census_df = census_df.rename(columns={
        0: 'census_total_pov',
        1: 'census_total_pop',
        2: 'zcta'
    })

    # create the poverty rate column by dividing total in pov by the total pop
    in_poverty = pd.to_numeric(census_df['census_total_pov'])
    population = pd.to_numeric(census_df['census_total_pop'])
    census_df['census_pov_rate'] = (in_poverty / population) * 100
    return census_df