-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweather_prepMA.py
120 lines (80 loc) · 3.48 KB
/
weather_prepMA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
import pandas as pd
import requests
import json
import os
# Inputs: the traffic table sits next to this script; the station readings
# come from the sibling datathon data directory.
traffic_dat = pd.read_csv(filepath_or_buffer='trafficDAT.csv')
garmon_dat = pd.read_csv(filepath_or_buffer='../datathon_data/garmon_0217.csv')
def setPeak(hour):
    """Classify an hour of the day as "Peak" or "Slow".

    Hours up to and including 10, and hours from 15 onwards, are "Peak";
    anything else (the hours strictly between 10 and 15) is "Slow".
    """
    is_peak = hour <= 10 or hour >= 15
    return "Peak" if is_peak else "Slow"
# Tag every reading with its period ("Peak"/"Slow") and its calendar date.
garmon_dat['DATEUTC'] = pd.to_datetime(garmon_dat['DATEUTC'])
garmon_dat['period'] = garmon_dat['DATEUTC'].dt.hour.map(setPeak)
garmon_dat['date'] = garmon_dat['DATEUTC'].dt.date

# Covariates that are later averaged onto the traffic rows.
kept_var = [
    'ALT',
    'TEMPC',
    'HUMIDITY',
    'WINDCHILLF',
    'WINDSPEEDKMH',
    'RAININ',
    'SOLARRADIATION',
]

# Per-column reducers; the key order fixes the column order of the output.
agg_dict = {
    **dict.fromkeys(('LAT', 'LON', 'ALT'), 'first'),
    'TEMPC': 'median',
    **dict.fromkeys(
        ('HUMIDITY', 'WINDCHILLF', 'WINDSPEEDKMH', 'RAININ', 'SOLARRADIATION'),
        'mean',
    ),
}

# Collapse the readings to one row per station, date and period.
per_station_period = garmon_dat.groupby(['ID', 'date', 'period']).agg(agg_dict)
garmon_dat = per_station_period.reset_index()

# Persist the aggregate, then continue from the file that was just written.
# NOTE: after the reload `date` holds whatever read_csv parses from the file,
# not the date objects created above.
garmon_dat.to_csv('garmonOUT.csv', index=False)
garmon_dat = pd.read_csv('garmonOUT.csv')

# One row per distinct (ID, LAT, LON) combination.
garmon_gps = garmon_dat.loc[:, ['ID', 'LAT', 'LON']].drop_duplicates()
## Build dictionary of id:dist for each segment
# One row per distinct (segment_id, lng, lat) combination in the traffic data.
traffic_gps = traffic_dat.loc[:, ['segment_id', 'lng', 'lat']].drop_duplicates()
def compute_sqeucl(x1, y1, x2, y2):
    """Return the squared Euclidean distance between (x1, y1) and (x2, y2).

    No square root is taken, so the result is in squared input units.
    """
    dx = x1 - x2
    dy = y1 - y2
    return dx**2 + dy**2
# segment_distD[segment_id] -> {station ID: squared distance to that segment}.
segment_distD = {}
garmon_ids = np.array(garmon_gps['ID'])
station_lat = np.array(garmon_gps['LAT'])
station_lon = np.array(garmon_gps['LON'])

# Columns are read by name: integer keys on a label-indexed Series
# (`row[0]`) depend on a positional fallback that pandas has deprecated.
segment_rows = zip(traffic_gps['segment_id'], traffic_gps['lng'], traffic_gps['lat'])
for segment, current_lng, current_lat in segment_rows:
    # One vectorised call gives the distance to every station at once,
    # in the same order as garmon_ids.
    distances = compute_sqeucl(station_lat, station_lon, current_lat, current_lng)
    segment_distD[int(segment)] = dict(zip(garmon_ids, distances))
## Append to the traffic data the weighted mean of the weather covariates


def _weighted_weather(stations, dist_by_id, variables):
    """Return the inverse-distance weighted mean of `variables` over `stations`.

    stations   -- station rows for one (date, period), or None if there are none.
    dist_by_id -- {station ID: distance from the segment to that station}.
    variables  -- names of the columns to average.

    The result is a 1-D float array ordered like `variables`. It is all NaN
    when no station with a known distance is available. Missing readings are
    skipped in the sum without renormalising the remaining weights.
    """
    missing = np.full(len(variables), np.nan)
    if stations is None or stations.shape[0] == 0:
        return missing

    dist = np.asarray(stations['ID'].map(dist_by_id), dtype=float)
    known = ~np.isnan(dist)
    if not known.any():
        return missing
    dist = dist[known]
    values = np.asarray(stations[variables], dtype=float)[known]

    # A station exactly on the segment would get an infinite weight and turn
    # the whole row into zeros; let such stations carry all the weight instead.
    at_segment = dist == 0
    if at_segment.any():
        raw = at_segment.astype(float)
    else:
        raw = 1 / dist

    weighted = values * raw[:, np.newaxis] / raw.sum()
    return np.nansum(weighted, axis=0)


# Index the station rows by (date, period) once instead of masking the whole
# table for every traffic row. Dates are compared as strings.
garmon_dates = garmon_dat['date'].astype('str')
stations_by_slot = {
    slot: readings
    for slot, readings in garmon_dat.groupby([garmon_dates, garmon_dat['period']])
}

n = traffic_dat.shape[0]
weather_rows = []
traffic_keys = zip(traffic_dat['date'], traffic_dat['period'], traffic_dat['segment_id'])
for i, (date_current, period_current, segment_current) in enumerate(traffic_keys):
    print(str(i/n))
    garmon_sub = stations_by_slot.get((date_current, period_current))
    weather_rows.append(
        _weighted_weather(garmon_sub, segment_distD[segment_current], kept_var)
    )

# Build the frame in one go; concatenating row by row is quadratic.
weather_columns = pd.DataFrame(weather_rows, columns=kept_var, index=traffic_dat.index)
weather_columns.to_csv('weatherCOLS.csv', index=False)

tw_dat = pd.concat((traffic_dat, weather_columns), axis=1)
tw_dat.to_csv('trafficWeather_dat.csv', index=False)
# Notebook-style preview; has no visible effect when run as a script.
tw_dat.head(20)
## Debugging
# Row count per date in the merged table (np.histogram cannot bin strings).
tw_dat['date'].astype(str).value_counts()
np.unique(tw_dat['date'])
np.unique(garmon_dat['date'])
np.unique(traffic_dat['date'])
# Positions of the rows that received no weather data. weather_columns is
# row-aligned with traffic_dat, so the dates are looked up there.
a = np.where(pd.isna(weather_columns['ALT']))[0]
traffic_dat['date'].iloc[a]