forked from mggg/submission-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch.py
140 lines (134 loc) · 6.91 KB
/
fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# more imports than we need
# import geopandas as gpd
# import maup
import pandas as pd
import matplotlib.pyplot as plt
import requests
import json
import csv
import io
import pydantic
from pydantic import BaseModel
from datetime import datetime as dt
from typing import Tuple
class Submission(BaseModel):
    """
    pydantic record for one districtr submission.

    link: districtr link of the submission
    districtr_plan: districtr json object (the assignment) for the plan
    plan_type: the type of the plan (other code in this file compares it
        against "plan" and "coi")
    id: id of the plan
    """
    link: str
    districtr_plan: dict  # districtr json obj
    plan_type: str
    id: str
def submissions(ids_url: str, plans_url: str, cois_url: str,
                wr_url: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Takes in endpoint for all districtr ids in a portal along with csv api
    calls for plans, cois, and written submissions, and retrieves filled pd
    dataframes for each submission type with metadata and districtr assignments

    ids_url: endpoint handed to retrieve_submission_ids_json
    plans_url, cois_url, wr_url: csv endpoints handed to csv_read; the plan
        and coi csvs must contain 'link' and 'datetime' columns, the written
        csv a 'datetime' column

    returns (plans_df, cois_df, written_df). plans_df and cois_df are sorted
    by a new 'plan_id' column and carry the districtr json in a new
    'districtr_data' column; in all three the 'datetime' column holds
    datetime objects instead of strings

    raises AssertionError if the number of plan (or coi) submissions from the
    ids endpoint differs from the number of rows in the matching csv, and
    ValueError if a datetime field has no UTC offset or does not match the
    expected layout
    """
    def plan_id_from_link(link):
        # last path segment of the link with any query string removed
        return link.split("/")[-1].split("?")[0]

    def parse_datetime(raw):
        # The raw field has the offset glued to the zone name and may carry
        # trailing text (presumably something like
        # "... GMT+0000 (Coordinated Universal Time)" -- confirm against the
        # portal). strptime needs "<zone> <offset>" and nothing after it.
        # "+" is tried first; "-" is the fallback so that negative offsets
        # are parsed instead of raising IndexError.
        for sign in ("+", "-"):
            pieces = raw.split(sign)
            if len(pieces) > 1:
                cleaned = pieces[0] + " " + sign + pieces[1].split(" ")[0]
                return dt.strptime(cleaned, '%a %b %d %Y %X %Z %z')
        raise ValueError(
            f"no UTC offset found in datetime field: {raw!r}")

    def attach_districtr_data(df, districtr_jsons):
        # The json list is ordered by submission id, so the rows are sorted
        # by the id parsed from their link before the list is assigned
        # positionally.
        df['plan_id'] = df["link"].map(plan_id_from_link)
        df = df.sort_values(by=['plan_id'], ascending=True)
        df['districtr_data'] = districtr_jsons
        return df

    fetched = retrieve_submission_ids_json(ids_url)
    fetched.sort(key=lambda sub: str(sub.id))  # sorts submission jsons by id
    plan_jsons = [sub.districtr_plan for sub in fetched  # filters plans
                  if sub.plan_type == "plan"]
    coi_jsons = [sub.districtr_plan for sub in fetched  # filters cois
                 if sub.plan_type == "coi"]

    plans_df = csv_read(plans_url)  # gathers plan metadata in df
    cois_df = csv_read(cois_url)  # gathers coi metadata in df
    written_df = csv_read(wr_url)  # gathers written metadata in df

    # Raised explicitly (not via `assert`) so the check still runs under
    # `python -O`; the exception type is kept as AssertionError for callers.
    if len(plan_jsons) != len(plans_df):
        raise AssertionError(
            f"{len(plan_jsons)} plan submissions from the ids endpoint but "
            f"{len(plans_df)} rows in the plans csv")
    if len(coi_jsons) != len(cois_df):
        raise AssertionError(
            f"{len(coi_jsons)} coi submissions from the ids endpoint but "
            f"{len(cois_df)} rows in the cois csv")

    # join in districtr json assignments into 'districtr_data' column
    plans_df = attach_districtr_data(plans_df, plan_jsons)
    cois_df = attach_districtr_data(cois_df, coi_jsons)

    # convert datetime fields from str's to datetime objects in all dataframes
    for df in (plans_df, cois_df, written_df):
        df['datetime'] = df['datetime'].map(parse_datetime)

    # return relevant dataframes
    return plans_df, cois_df, written_df
def plan_read(plan_id: str) -> dict:  # (dict: json obj)
    """
    takes in plan_id string, makes api call w/ plan_id to the planRead
    function in netlify, and returns the data associated with the plan_id in
    JSON format

    plan_id: id of the plan; it is interpolated into the query string as-is,
        so an int formats to the same url as its string form
    returns the response body decoded with json.loads
    raises json.JSONDecodeError if the response body is not valid JSON
    """
    url = f"https://districtr.org/.netlify/functions/planRead?id={plan_id}"
    response = requests.get(url)
    return json.loads(response.text)
def retrieve_submission_ids_json(url: str) -> list:  # list: list[Submission]
    """
    takes a url (an endpoint to a given state's submission portal), reads the
    entries under its 'ids' key, and returns a list of filled Submission
    objects, one per entry, in the order the endpoint lists them
    """
    # TODO: temp fix for the purposes of user-agent api call barrier
    browser_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.101 Safari/537.36'
        ),
    }
    response = requests.get(url, headers=browser_headers)
    listing = json.loads(response.text)

    def build_submission(entry):
        # Phase 1: link, id (last path segment of the link, query string
        # dropped) and type of plan
        link = entry['link']
        plan_id = link.rsplit("/", 1)[-1].partition("?")[0]
        kind = entry['type']
        # Phase 2: add the districtr plan (assignment) fetched for that id
        return Submission(link=link, plan_type=kind,
                          id=plan_id, districtr_plan=plan_read(plan_id))

    return [build_submission(entry) for entry in listing['ids']]
def csv_read(url: str) -> pd.DataFrame:
    """
    downloads the csv served at url (api endpt to query on given submission
    portal), decodes it as utf-8, and returns its contents as a pandas
    dataframe
    """
    # TODO: temp fix for the purposes of user-agent api call barrier
    browser_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.101 Safari/537.36'
        ),
    }
    response = requests.get(url, headers=browser_headers)
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text))
def coi_submissions(ids_url: str, cois_url: str) -> pd.DataFrame:
    """
    Takes in endpoint for only coi districtr ids in a portal along with csv
    api call for cois, and retrieves a filled pd dataframe of the coi
    submissions with metadata and districtr assignments

    ids_url: endpoint handed to retrieve_submission_ids_json
    cois_url: csv endpoint handed to csv_read; the csv must contain 'link'
        and 'datetime' columns

    returns a single dataframe (not a tuple), sorted by a new 'plan_id'
    column, with the districtr json in a new 'districtr_data' column and the
    'datetime' column converted from strings to datetime objects

    raises AssertionError if the number of coi submissions from the ids
    endpoint differs from the number of rows in the csv, and ValueError if a
    datetime field has no UTC offset or does not match the expected layout
    """
    def parse_datetime(raw):
        # The raw field has the offset glued to the zone name and may carry
        # trailing text (presumably something like
        # "... GMT+0000 (Coordinated Universal Time)" -- confirm against the
        # portal). strptime needs "<zone> <offset>" and nothing after it.
        # "+" is tried first; "-" is the fallback so that negative offsets
        # are parsed instead of raising IndexError.
        for sign in ("+", "-"):
            pieces = raw.split(sign)
            if len(pieces) > 1:
                cleaned = pieces[0] + " " + sign + pieces[1].split(" ")[0]
                return dt.strptime(cleaned, '%a %b %d %Y %X %Z %z')
        raise ValueError(
            f"no UTC offset found in datetime field: {raw!r}")

    fetched = retrieve_submission_ids_json(ids_url)
    fetched.sort(key=lambda sub: str(sub.id))  # sorts submission jsons by id
    coi_jsons = [sub.districtr_plan for sub in fetched  # filters cois
                 if sub.plan_type == "coi"]
    cois_df = csv_read(cois_url)  # gathers coi metadata in df

    # Raised explicitly (not via `assert`) so the check still runs under
    # `python -O`; the exception type is kept as AssertionError for callers.
    if len(coi_jsons) != len(cois_df):
        raise AssertionError(
            f"{len(coi_jsons)} coi submissions from the ids endpoint but "
            f"{len(cois_df)} rows in the cois csv")

    # parse for plan id (last path segment of the link, query string dropped)
    cois_df['plan_id'] = cois_df["link"].map(
        lambda link: link.split("/")[-1].split("?")[0])
    # sort df by plan id so rows line up with the id-sorted json list, which
    # is then assigned positionally into the 'districtr_data' column
    cois_df = cois_df.sort_values(by=['plan_id'], ascending=True)
    cois_df['districtr_data'] = coi_jsons

    # convert datetime field from str's to datetime objects
    cois_df['datetime'] = cois_df['datetime'].map(parse_datetime)

    # return relevant dataframe
    return cois_df