-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathextract_features.py
198 lines (156 loc) · 7.12 KB
/
extract_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#
# Extract features from a given bill object
# NOTE(review): This is Python 2 code -- httplib, sets.Set, and the
# `unicode` type used below do not exist in Python 3.
import httplib, json, re
import config
from nltk import tokenize
from sets import Set
import string
from datetime import date
from nltk.stem.porter import PorterStemmer
import config  # duplicate of the `import config` above -- harmless but redundant
st = PorterStemmer()  # shared stemmer instance used by generate_summary_vector
import sys
from pprint import pprint
# Module-level accumulators; not referenced anywhere in this file -- possibly dead.
dictj={}
reps=[]
def get_bill(id):
    """Load and parse a bill JSON object from the bill_map directory.

    Args:
        id: Bill identifier; converted to str and used as the filename
            under bill_map/.

    Returns:
        The parsed JSON object (dict) for the bill.
    """
    # Use a context manager so the handle is closed even if json.loads
    # raises (the original code leaked the open file).
    with open('bill_map/' + str(id)) as f:
        return json.loads(f.read())
''' Removes unicode characters from string and converts to standard string
    Some of our data seems to have unicode characters that won't print '''
def removeNonAscii(s):
    # Keep only 7-bit ASCII characters, then coerce to a plain str.
    ascii_chars = [ch for ch in s if ord(ch) < 128]
    return str("".join(ascii_chars))
def convert_bool_to_int(boolean):
    """Map a truthy value to 1 and a falsy value to 0."""
    return 1 if boolean else 0
def extractFeatures(bill):
    '''
    Extract the feature dictionary for a single bill.

    Args:
        bill: Parsed bill JSON object (see get_bill).

    Returns:
        Dictionary of features representing this bill.

    Raises:
        ValueError: if a numeric feature is unexpectedly None.
    '''
    # Clean up name: keep everything before the first '[' and drop spaces.
    name = bill['sponsor']['name']
    match = re.search(r'[^\[]*', name)
    clean_name = name[match.start():match.end()].replace(' ', '')

    # All dates are ISO "YYYY-MM-DD" strings.
    year = int(bill['current_status_date'][0:4])
    year_mod2 = year % 2
    year_mod4 = year % 4
    year_mod6 = year % 6
    year_introduced = int(bill['introduced_date'][0:4])

    # Days elapsed between introduction and the current status date.
    vote_date = bill['current_status_date'].split("-")
    vd = date(int(vote_date[0]), int(vote_date[1]), int(vote_date[2]))
    introduce_date = bill['introduced_date'].split("-")
    intro = date(int(introduce_date[0]), int(introduce_date[1]), int(introduce_date[2]))
    bill_length = (vd - intro).days

    bill_is_alive = bill['is_alive']
    bill_is_current = bill['is_current']
    sponsor_end_year = int(bill['sponsor_role']['enddate'][:4])
    sponsor_start_year = int(bill['sponsor_role']['startdate'][:4])
    sponsor_gender = bill['sponsor']['gender']
    sponsor_party = bill['sponsor_role']['party']
    sponsor_has_nickname = bill['sponsor']['nickname'] != ''
    sponsor_district = str(bill['sponsor_role']['district'])
    sponsor_has_twitter = bill['sponsor']['twitterid'] != ''
    congress = int(bill['congress'])
    # (Removed duplicate assignments and the unused sponsor_is_alive,
    # which misleadingly read bill['is_current'].)

    # NOTE: Any features that we want distributed into a list of binary
    # features must be a string. Also make sure the data is preprocessed
    # when you add a new string feature.
    features = {
        'sponsor_name': clean_name,
        'vote_year': year,
        'vote_month': int(bill['current_status_date'][5:7]),
        # BUG FIX: the day is characters 8-10 of "YYYY-MM-DD"; the old
        # [9:] slice dropped the first digit of two-digit days.
        'vote_day': int(bill['current_status_date'][8:10]),
        'vote_year_m2': year_mod2,
        'vote_year_m4': year_mod4,
        'vote_year_m6': year_mod6,
        'year_introduced': year_introduced,
        'bill_length': bill_length,
        'bill_is_alive': bill_is_alive,
        'bill_is_current': bill_is_current,
        'sponsor_end_year': sponsor_end_year,
        'sponsor_start_year': sponsor_start_year,
        'sponsor_gender': sponsor_gender,
        'sponsor_has_nickname': sponsor_has_nickname,
        'sponsor_district': sponsor_district,
        'sponsor_has_twitter': sponsor_has_twitter,
        'sponsor_party': sponsor_party,
        'congress': congress
    }
    # Normalize values: unicode -> plain ascii str, bool -> 0/1.
    for f in features:
        if isinstance(features[f], unicode):  # Python 2 text type
            features[f] = removeNonAscii(features[f])
        if isinstance(features[f], bool):
            features[f] = convert_bool_to_int(features[f])
    # Remove any features we're ignoring (pop with default avoids the
    # membership pre-check).
    for f in config.features_to_ignore:
        features.pop(f, None)
    # Defensive sanity check: numeric features must never be None.
    # BUG FIX: the original raised a bare string, which is itself a
    # TypeError in Python >= 2.6; raise a real exception instead.
    for f in features:
        if isinstance(features[f], (int, float)) and features[f] is None:
            raise ValueError("The " + f + " feature should be a number but its value is None")
    return features  # Return dictionary of features
def generate_feature_vector(bill, preprocess_data):
    '''
    Generates a feature vector in the form our ML Kit takes.

    Args:
        bill: Parsed bill JSON object.
        preprocess_data: Dict holding 'summary_word_bag' (set of words seen
            in training) and 'bill_feature_set' (feature name ->
            {string value: bit index}) built during preprocessing.

    Returns:
        Tuple (feature_vector, feature_vector_labels) of parallel lists.
    '''
    feature_vector = []         # Final feature vector
    feature_vector_labels = []  # Label for each feature vector bit

    # ------------- Summary (bag of words) --------------
    if 'summary_word_bag' not in config.features_to_ignore:
        # BUG FIX: close the summary file deterministically (the original
        # leaked the handle).
        with open('bill_summaries/' + bill['id']) as f:
            summary_text = json.loads(f.read())
        summary_vector, summary_labels = generate_summary_vector(summary_text, preprocess_data)
        feature_vector.extend(summary_vector)
        feature_vector_labels.extend(summary_labels)

    # ------------- Bill Features ---------------
    # Features are a mix of types; compress them into real values.
    if 'bill_feature_set' not in config.features_to_ignore:
        bill_features = extractFeatures(bill)
        bill_feature_set = preprocess_data['bill_feature_set']
        for feature_name in bill_features:
            value = bill_features[feature_name]
            if feature_name in bill_feature_set:
                # String feature: one-hot encode. Start with a blank vector
                # and set the bit if the value was present in training.
                value_to_index = bill_feature_set[feature_name]
                vector = [0] * len(value_to_index)
                if value in value_to_index:
                    vector[value_to_index[value]] = 1
                feature_vector.extend(vector)
                # BUG FIX: emit labels in bit-index order so they line up
                # with the one-hot bits; plain .keys() order is arbitrary
                # and need not match the index mapping.
                feature_vector_labels.extend(sorted(value_to_index, key=value_to_index.get))
            else:
                # Numeric (int/float) features pass straight through.
                feature_vector.append(value)
                feature_vector_labels.append(feature_name)
    return (feature_vector, feature_vector_labels)
def generate_summary_vector(summary_text, preprocess_data):
    '''
    Build the bag-of-words frequency vector for a bill summary.

    Args:
        summary_text: Raw summary string for the bill.
        preprocess_data: Dict whose 'summary_word_bag' entry is the set of
            words seen during training.

    Returns:
        Tuple (frequencies, words): parallel sequences of counts and the
        words they count.
    '''
    # Strip all punctuation in a single regex pass, then tokenize.
    strip_punct = re.compile('[%s]' % re.escape(string.punctuation))
    cleaned = strip_punct.sub('', summary_text)
    tokens = tokenize.word_tokenize(cleaned)
    bag = preprocess_data['summary_word_bag']
    # Start every known word at zero frequency.
    counts = dict((word, 0) for word in bag)
    # Tally each token that survives (optional) stemming and is in the bag.
    for token in tokens:
        if config.stem_words:
            token = st.stem(token)
        if token in counts:
            counts[token] += 1
    return (counts.values(), counts.keys())