-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathpreprocess.py
126 lines (98 loc) · 4.03 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import extract_features
import hashlib, json, re
from nltk import tokenize
from sets import Set
from pprint import pprint
import string
import config
import operator
from nltk.stem.porter import PorterStemmer
def preprocess(rep_id, bills):
'''
Preprocesses all votes/bills for this rep.
This method only calculates the preprocess data if the md5 of it's inputs or preprocess.py are changed.
(Either the bills or the preprocessing method is different.)
Returns:
A dictionary with all preprocessing data to be passed to the feature generation.
'''
loaded = False
# Load our current cached preprocess data
try:
print "Loading preprocess data from store.."
preprocess_data = json.loads(open('preprocess_data/'+rep_id).read())
except Exception as e:
preprocess_data = {}
# If we are using the summary_word_bag
if 'summary_word_bag' not in config.features_to_ignore:
# don't regenerate if we loaded it (takes too long)
if 'summary_word_bag' not in preprocess_data or config.force_preprocess:
print ".. Building summary word bag"
preprocess_data['summary_word_bag'] = generate_summary_word_set(bills)
# If we are using the bill features
if 'bill_feature_set' not in config.features_to_ignore:
print 'Building base feature set for string features'
preprocess_data['bill_feature_set'] = generate_bill_feature_sets(bills)
# Write our new preprocess data and hash values
f = open('preprocess_data/'+rep_id, 'w')
f.write(json.dumps(preprocess_data))
f.close()
return preprocess_data
st = PorterStemmer()
def generate_summary_word_set(bills):
''' Generates a set of frequencies of all words in summaries of bills provided '''
regex = re.compile('[%s]' % re.escape(string.punctuation))
word_set = {}
for bill in bills:
b = bill['id']
summary_text = json.loads(open('bill_summaries/'+b).read())
summary_clean = regex.sub('', summary_text)
words = tokenize.word_tokenize(summary_clean)
for w in words:
if config.stem_words:
w = st.stem(w)
if w in word_set:
word_set[w] += 1
else:
word_set[w] = 1
word_list = []
for w in word_set:
word_list.append((w, word_set[w]))
word_list = sorted(word_list, key=operator.itemgetter(1), reverse=True)
pprint(word_list[:100])
# Apparently we can remove all words only seen once? Interesting I guess. We can comment this in if necessary.
# TODO(john): Add in NLP additions such as removing word endingds, and possibly bigrams
return word_set
def generate_bill_feature_sets(bills):
'''
Generates the mappings of string features in our bill features to a list of binary features.
Returns:
Dictionary as follows: {
"name of the feature" : {
"possible value of feature" : (which bit to turn on)
}
}
--- Ex: ---
{
"lastname": {
"Smith": 0,
"Baker": 4,
"Johnson": 11
}
}
'''
possible_states = {}
for bill in bills:
features = extract_features.extractFeatures(bill)
# Loop through each feature. If it's a string add it to the possible state list for that feature.
for f in features:
if not isinstance(features[f], str): # Only keep track of strings
continue
if f not in possible_states:
possible_states[f] = {}
# Add the value of this feature to the list of possible values for that feature
possible_states[f][features[f]] = 0
# Map each feature value to an integer
for feature in possible_states:
for i, value in enumerate(possible_states[feature]):
possible_states[feature][value] = i
return possible_states