-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_db.py
248 lines (202 loc) · 9.93 KB
/
update_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# command line args (what to be updated & file(s) to use): legislators; contributions
import requests, os, argparse
from model import connect_to_db, db, Legislator, Contrib_leg, Contributors, Type_contrib, Contrib_pac, Industry
from server import app
from helper_functions import get_first_term_year
from string import capwords
import psycopg2
def update_legislators():
""" input = file handle from which to look for updates to legislators
output = nothing
checks for new legislators. they can be 1) filling empty seat or 2) replacing an occupied seat
"""
currently_loaded = db.session.query(Legislator.leg_id).all() #list of tuples => [(leg_id (known as crp_id in sunlight data,), (leg_id,)]
loaded_ids = {tup[0]: True for tup in currently_loaded}
# use this to keep track of currently serving members, we can then check against ones in db.
# if its in our db but not in this list, then we need to delete from our db
current_mem_ids = {}
apikey = os.environ['SUNLIGHT_API_KEY']
query_params = { 'apikey': apikey,
'per_page': 'all',
}
endpoint = 'https://congress.api.sunlightfoundation.com/legislators/'
# gets all current members of congress. need to find ones we don't already have in our system
response = (requests.get(endpoint, params=query_params)).json()
for entry in response['results']:
crp_id = entry['crp_id']
bioguide_id = entry['bioguide_id']
first = entry['first_name']
last = entry['last_name']
nickname = entry['nickname']
suffix = entry['name_suffix']
title = entry['title']
state = entry['state']
party = entry['party']
chamber = entry['chamber'].capitalize()
twitter_id = entry.get('twitter_id', None)
facebook_id = entry.get('facebook_id', None)
if chamber =="Senate":
sen_rank = entry['state_rank'].capitalize()
district = None
else:
district = int(entry['district']) #only for house members
sen_rank = None
off_website = entry['website']
open_cong_url = None
pict_link = "https://theunitedstates.io/images/congress/225x275/"+bioguide_id+".jpg"
first_elected = get_first_term_year(crp_id, apikey) #API call to SLF to get the year
current_mem_ids[crp_id] = True
# if the member is not already loaded, load it
if crp_id not in loaded_ids:
temp_legislator_object = Legislator(leg_id=crp_id, bioguide_id=bioguide_id, first=first, last=last, nickname=nickname, suffix=suffix, title=title, state=state, party=party, chamber=chamber, twitter_id=twitter_id, facebook_id=facebook_id, district=district, sen_rank=sen_rank, off_website=off_website, open_cong_url=open_cong_url, pict_link=pict_link, first_elected=first_elected)
db.session.add(temp_legislator_object)
# if member is in our db but isn't current member, remove from db
for x in loaded_ids.keys():
if x not in current_mem_ids:
member = Legislator.query.filter_by(leg_id = x).first()
# get all contributions to that member and delete from table
contributions = Contrib_leg.query.filter_by(leg_id= x).all()
for item in contributions:
db.session.delete(item)
db.session.delete(member)
db.session.commit()
def update_indiv_to_leg(file_path):
""" will update individual to member contributions table & individual giver table (as necessary) """
cont_id_dict = {}
file_open = open(file_path)
for index, line in enumerate(file_open):
value = line.split("|,|")
# TODO: change this to checking if val[4] (crp_id) is in our legislator table (leg_id)
if Legislator.query.get(value[4]):
# per CRP documentation. to make sure only get indiv. donors, check that contrib_id has a value. first need to strip whitespace.
# this worked on the test set but not when running the full thing, non-I-contributors still got in. unsure why. so, just deleted
# from db directly by deleting all from contrib_legislators table where id was empty string.
value[2].replace(" ", "")
if value[2] != "":
contrib_id = value[2].strip()
FEC_trans_id = value[1]
name = capwords(value[3])
contrib_type = "I"
contrib_state = value[9]
cycle = value[0].strip('|')
leg_id = value[4]
#first split of line makes it so there is one grouping of industry, data & amount given so need to further parse that.
split_ind_amt_data = value[7].split(',')
if split_ind_amt_data[0]:
if split_ind_amt_data[0] == "NONE":
industry_id = None
else:
industry_id = split_ind_amt_data[0].strip('|')
if not Industry.query.get(industry_id):
industry_id = "unknown"
else:
industry_id = None
amount = int(float(split_ind_amt_data[2]))
# need to weed out multiples of the contributor going into table - otherwise fails PK constraint on contributor_id.
# add the id as a key and set initial value to 1 but increment value every time you have someone with that id.
cont_id_dict[contrib_id] = cont_id_dict.get(contrib_id, 0) + 1
if industry_id[0:2] != "Z9":
#allow for the person to be added to Contributor table first time encountered but not after.
# also check for contributor in our table already. .first() returns the item or None so only add if we get None
# we don't want to add to the Contributors but we do want to add the amount to Contrib_leg table
if cont_id_dict.get(contrib_id) == 1 and not Contributors.query.filter_by(contrib_id=contrib_id).first():
temp_contrib_obj = Contributors(contrib_id=contrib_id, name=name, contrib_type=contrib_type, industry_id=industry_id, contrib_state=contrib_state)
db.session.add(temp_contrib_obj)
temp_contrib_leg_obj = Contrib_leg(contrib_id=contrib_id, FEC_trans_id=FEC_trans_id, leg_id=leg_id, amount=amount, cycle=cycle)
db.session.add(temp_contrib_leg_obj)
if index % 100 == 0:
db.session.commit()
db.session.commit()
print "individuals data successfully added ", file_path
file_open.close()
def update_pac_to_leg(file_path):
""" updates/adds to PAC to legislator contributions """
file_open = open(file_path)
for index, line in enumerate(file_open):
value = line.split(",")
leg_id = value[3].strip('|')
if Legislator.query.get(leg_id):
cont_type = value[7].strip('|')
industry_id = value[6].strip('|')
#z9 and z4 are considered non-contribution types -> transfers
if industry_id[0:2] != "z9" and industry_id[0:2] != "z4":
if cont_type != "24A" and value[7] != "24N":
cycle = value[0].strip('|')
FEC_trans_id = value[1].strip('|')
contrib_id = value[2].strip('|')
amount = int(float(value[4]))
temp_contrib_leg_obj = Contrib_leg(contrib_id=contrib_id, FEC_trans_id=FEC_trans_id, leg_id=leg_id, amount=amount, cycle=cycle)
db.session.add(temp_contrib_leg_obj)
if index % 100 == 0:
db.session.commit()
db.session.commit()
print "pac contribution data successfully added ", file_path
file_open.close()
def update_pac_contributors(file_path):
""" updates PAC contributor info table (as necessary)"""
# will need to check if contributor id exists, if not, add otherwise, pass
cont_id_dict = {}
file_open = open(file_path)
for index, line in enumerate(file_open):
value = line.split(",")
contrib_id = value[1].strip('|')
contrib_type = "C"
#check if contributor has been entered by checking db and dict of those already in db
if contrib_id not in cont_id_dict and not Contributors.query.get(contrib_id):
cont_id_dict.setdefault(contrib_id, True)
name = value[2].strip('|')
if value[9]:
industry_id = value[9].strip("|")
if not Industry.query.get(industry_id):
industry_id = "unknown"
else:
industry_id = "unknown"
temp_contrib_obj = Contributors(contrib_id=contrib_id, industry_id=industry_id, name=name, contrib_type=contrib_type)
db.session.add(temp_contrib_obj)
db.session.commit()
print "pac contributor info data successfully added", file_path
file_open.close()
def update_top_contributors():
db_connection = psycopg2.connect("dbname='contributions' user='corey' host='localhost'")
db_cursor = db_connection.cursor()
# db_cursor.execute("DELETE from contributors where contrib_id = ''")
# db_cursor.execute("DELETE from contrib_legislators where contrib_id = ''")
# db_connection.commit()
# this is where we'll update top contributors
currently_loaded = db.session.query(Legislator.leg_id).all() #list of tuples => [(leg_id (known as crp_id in sunlight data,), (leg_id,)]
loaded_ids = {tup[0]: True for tup in currently_loaded}
for item in loaded_ids:
#get the member object from the database
try:
member = Legislator.query.filter_by(leg_id = item).first()
#get the dictionary to be stored as json using method
contribution_dict = member.create_contribution_dict()
# add to the legislator table
member.top_contributors = contribution_dict
db.session.commit()
except Exception, e:
# print "Exception: ", e
# print "not loaded: ", item
continue
def main():
parser = argparse.ArgumentParser(description="update congressional money trail db")
parser.add_argument("operation", help='define what you want to update. options are "legislators", "pac_contributions", "indiv_contributions", or "update_top_contributors"')
args = parser.parse_args()
if args.operation.lower() == "legislators":
update_legislators()
elif args.operation.lower() == "pac_contributions":
# this will need additional arguments - the file names from CRP
file_handle_contributions = raw_input("Please indicate source file for PAC contribution data: ")
file_handle_PACs = raw_input("Please indicate source file for PAC informational data: ")
update_pac_to_leg(file_handle_contributions)
update_pac_contributors(file_handle_PACs)
elif args.operation.lower() == "indiv_contributions":
file_handle = raw_input("Please indicate source file for individual data: ")
update_indiv_to_leg(file_handle)
elif args.operation.lower() == "update_top_contributors":
update_top_contributors()
else:
print "please use 'legislators', 'pac_contributions' or 'indiv_contributions', or 'update_top_contributors' as additional argument"
if __name__ == "__main__":
connect_to_db(app)
main()