import json
import pickle
import multiprocessing
import mlAlgos
import sys
from multiprocessing.pool import ThreadPool
from numpy import array
from pprint import pprint
from scipy.sparse import csr_matrix
from sklearn.cross_validation import KFold
from time import time

# This file provides the framework for easily running different scikit-learn
# algorithms in parallel. It also includes functions for consuming and
# formatting the original, raw data.

##########################################
# Original data format functions
##########################################

# Reads a json file into python data structures
def readJson(filename):
    with open(filename) as f:
        return json.load(f)

# Data looks like
# {
#     "id": 24717,
#     "cuisine": "indian",
#     "ingredients": [
#         "tumeric",
#         "vegetable stock",
#         "tomatoes",
#         "garam masala",
#         "naan",
#         "red lentils",
#         "red chili peppers",
#         "onions",
#         "spinach",
#         "sweet potatoes"
#     ]
# },

# Takes the recipes json data, gets the set of all possible ingredients,
# converts this set to a sorted list,
# and assigns a number (representing a dimension) to each ingredient.
#
# Returns a dictionary where the first dim is indexed by ingredient, and the
# second dimension is indexed by either 'dim' or 'usageCount'.
# dim is the feature vector dimension representing inclusion of the ingredient
# usageCount is the number of recipes using this ingredient
def genIngredMap(recipes):
    ingreds = set()
    # Get a set of all ingredients in use
    for recipe in recipes:
        for ingred in recipe['ingredients']:
            ingreds.add(ingred)
    # Convert set to sorted list
    ingreds = sorted(list(ingreds))
    # Create a dict, indexed by ingredient name.
    # This allows us to look up the vector dimension for a given ingredient.
    ingredMap = dict()
    for x in range(len(ingreds)):
        # 'dim' is the dimension number for the ingredient;
        # the usage count is added below
        # (the count is just interesting to have - not super useful)
        ingredMap[ingreds[x]] = {'dim': x, 'usageCount': 0}
    # Add the usage counts to the map
    for recipe in recipes:
        for ingred in recipe['ingredients']:
            ingredMap[ingred]['usageCount'] += 1
    return ingredMap
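
# Each entry of the returned map looks like the sketch below (the index and
# count values are illustrative only, not taken from the real data):
#   ingredMap['naan'] == {'dim': 2731, 'usageCount': 57}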

# Conceptually identical to genIngredMap, but for cuisines (the labels)
def genCuisineMap(recipes):
    cuisines = set()
    for recipe in recipes:
        try:
            cuisines.add(recipe['cuisine'])
        except KeyError:
            # Recipes without a 'cuisine' field are unexpected here
            print(recipe)
            exit()
    cuisines = sorted(list(cuisines))
    cuisineMap = dict()
    for x in range(len(cuisines)):
        cuisineMap[cuisines[x]] = {'dim': x, 'usageCount': 0}
    for recipe in recipes:
        cuisineMap[recipe['cuisine']]['usageCount'] += 1
    return cuisineMap

# Converts json recipes to vectors, according to the ingredient map and
# cuisine map. Pass cMap=None for test data, which has no cuisine labels.
def genVectorRepresentation(recipes, iMap, cMap):
    vectors = list()
    seenUnknownIngred = False
    unknownIngredCount = 0
    # Loop through each recipe, vectorize it, and add it to vectors
    for recipe in recipes:
        # Create a list of zeros of the same size as the number of possible
        # ingredients, plus 1 extra slot for the label (if training data)
        if cMap:
            vector = [0.] * (1 + len(iMap))
        else:
            vector = [0.] * len(iMap)
        # Set the dimension to 1 for each present ingredient
        for ingred in recipe['ingredients']:
            if ingred in iMap:
                vector[iMap[ingred]['dim']] = 1.
            else:
                # Ingredient only appears in the test data; there is no
                # dimension for it, so just count it
                if not seenUnknownIngred:
                    #print("Unknown ingredient seen in test data:")
                    #print(" " + ingred)
                    seenUnknownIngred = True
                unknownIngredCount += 1
        # Add the label in the last slot
        if cMap:
            vector[-1] = cMap[recipe['cuisine']]['dim']
        vectors.append(vector)
    return vectors, unknownIngredCount
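
# Layout of each vector produced above:
#   vector[0 .. len(iMap)-1] -> 1.0 if the ingredient at that dimension is in
#                               the recipe, else 0.0
#   vector[-1]               -> the cuisine's label number (training data only)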

##########################################
# sklearn format functions
##########################################

# Reads the original data in, converts it to the intermediate vector format
# above, and finally packs it into a format suitable for scikit-learn
def toSklearnFormat(trainFile, testFile):
    # Read in the json files
    train = readJson(trainFile)
    test = readJson(testFile)
    # Build the ingredient and cuisine maps from the training data
    # (only the training data carries cuisine labels)
    iMap = genIngredMap(train)
    cMap = genCuisineMap(train)
    # Convert the json into lists of example vectors, using the maps from
    # ingredients to their indices and from cuisines to their label numbers
    trainVectors, unknownTrain = genVectorRepresentation(train, iMap, cMap)
    testVectors, unknownTest = genVectorRepresentation(test, iMap, None)
    # Create an object to store everything
    dataset = dict()
    dataset['unknownTestIngredCount'] = unknownTest
    # Add the training data target names (label names)
    dataset['target_names'] = array(list(cMap.keys()))
    # Add the feature names (ingredient names) (applies to test & train)
    dataset['feature_names'] = array(list(iMap.keys()))
    target = list()
    data = list()
    test = list()
    # Split the features from the label, and add each to a separate list
    for traindatum in trainVectors:
        data.append(traindatum[:-1])
        target.append(traindatum[-1])
    for testdatum in testVectors:
        test.append(testdatum)
    # Add the training examples to the dataset object
    dataset['data'] = csr_matrix(array(data))
    # Add the target labels to the dataset object
    dataset['target'] = array(target)
    # Add the test data to the dataset object
    dataset['test'] = csr_matrix(array(test))
    return dataset
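
# The returned dataset dict contains (per the code above):
#   'data'                   - training feature matrix (csr_matrix)
#   'target'                 - training label array
#   'test'                   - test feature matrix (csr_matrix)
#   'target_names'           - cuisine (label) names
#   'feature_names'          - ingredient (feature) names
#   'unknownTestIngredCount' - count of ingredients seen only in the test data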

# Print statistics on dataset
def printSklearnDatasetStats(dataset):
    print()
    print("Total dataset statistics:")
    print(" Training cuisine count: " + str(len(dataset['target_names'])))
    print(" Training ingredient count: " + str(len(dataset['feature_names'])))
    print(" Training recipes: " + str(dataset['data'].shape[0]))
    print(" Test recipes: " + str(dataset['test'].shape[0]))
    print(" Unknown Ingredients in Test Recipes: " + str(dataset['unknownTestIngredCount']))

##########################################
# serialization functions
##########################################

# Writes python object to disk
def serialize(dataset, filename):
    with open(filename, 'wb') as f:
        pickle.dump(dataset, f, protocol=-1)

# Reads python object from disk
def unserialize(serializedFile):
    with open(serializedFile, 'rb') as f:
        return pickle.load(f)

# Splits the input data into folds, and writes each fold to disk
def writeKfoldSets(fullTrainingData, fullTrainingTarget, foldNum):
    # Use KFold to split into foldNum (train, test) sets
    count = 0
    kf = KFold(fullTrainingData.shape[0], n_folds=foldNum)
    for train, test in kf:
        print(train, test)
        trainSet = dict()
        testSet = dict()
        trainSet['data'] = csr_matrix(fullTrainingData[train])
        testSet['data'] = csr_matrix(fullTrainingData[test])
        trainSet['target'] = fullTrainingTarget[train]
        testSet['target'] = fullTrainingTarget[test]
        trainFile = "train_" + str(count) + ".dat"
        testFile = "test_" + str(count) + ".dat"
        serialize(trainSet, trainFile)
        serialize(testSet, testFile)
        count += 1
    # Each trainSet/testSet = {'data': exampleVectors, 'target': listOfLabels}
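
# Typical data preparation flow (a sketch; the json file names below are just
# placeholders, not fixed by this module):
#   dataset = toSklearnFormat("train.json", "test.json")
#   printSklearnDatasetStats(dataset)
#   serialize(dataset, "dataset.dat")
#   writeKfoldSets(dataset['data'], dataset['target'], 10)  # writes train_N.dat / test_N.dat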

##########################################
# cross validation functions
##########################################

# Makes dictionaries hashable (for use with set())
class hashabledict(dict):
    def __hash__(self):
        return hash(tuple(sorted(self.items())))

# Generates combinations of possible parameter values
def traverse(a, level, l, res):
    if level == len(a):
        res.append(l)
        return
    for x in a[level]:
        ll = list(l)
        ll.append(x)
        traverse(a, level + 1, ll, res)

# Driver for traverse - generates combinations of possible parameter values
def genCombos(*valueNamePairs):
    combos = list()
    vals = [pair[0] for pair in valueNamePairs]
    labels = [pair[1] for pair in valueNamePairs]
    traverse(vals, 0, [], combos)
    dictCombos = list()
    for combo in combos:
        dictCombo = dict()
        for x in range(len(combo)):
            dictCombo[labels[x]] = combo[x]
        dictCombos.append(hashabledict(dictCombo))
    return dictCombos
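
# Example (the parameter names are illustrative, not tied to any algorithm):
#   genCombos(([0.1, 1.0], 'C'), (['linear', 'rbf'], 'kernel'))
# returns the cartesian product as hashable dicts:
#   [{'C': 0.1, 'kernel': 'linear'}, {'C': 0.1, 'kernel': 'rbf'},
#    {'C': 1.0, 'kernel': 'linear'}, {'C': 1.0, 'kernel': 'rbf'}]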

# Collects all folds for each paramCombo and aggregates the results
#
# resList like:
# [(paramCombo1, foldNum1, runTime1, correctTestCount11, totalTestCount11),
#  (paramCombo1, foldNum2, runTime2, correctTestCount12, totalTestCount12),
#  ...]
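#
# Returns one entry per paramCombo:
#   [paramCombo, totalRunTime, totalCorrect, totalTested, percentCorrect]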
def averageFolds(resList):
    comboSets = set()
    results = list()
    for res in resList:
        comboSets.add(res[0])
    pprint(comboSets)
    for combo in comboSets:
        correct = total = totalTime = 0
        for res in resList:
            if res[0] == combo:
                totalTime += res[2]
                correct += res[3]
                total += res[4]
        results.append([combo, totalTime, correct, total, 100 * correct / total])
    return results

# Main cross-validation function. Should be called from a "crossValidate<Algo>()"
# wrapper. This function provides the mechanics for parallelization and
# results aggregation/printing.
#
# The return value indicates whether the process was aborted (True == aborted).
# On abort, the caller should call exit(-1) so outstanding threads terminate.
def crossValidate(function, paramValuesLabelPairs):
    aborted = True
    finished = False
    # Set up threading
    num_cpus = multiprocessing.cpu_count()
    #num_cpus = 32
    p = ThreadPool(processes=num_cpus)
    if paramValuesLabelPairs:
        paramCombos = genCombos(*paramValuesLabelPairs)
    else:
        paramCombos = [0]
    rs = []
    results = []
    # Set up the progress "bar"
    length = len(paramCombos) * 10
    print("Validating ", int(length / 10), " parameter combinations...")
    sys.stdout.flush()
    try:
        for paramCombo in paramCombos:
            for foldNum in range(10):
                # Queue the algorithm on each paramCombo/fold pair
                r = p.apply_async(function, args=(foldNum, paramCombo))
                rs.append(r)
        finishedCount = 0
        progressStr = " Validation Progress: "
        progressStr += str(round(100 * finishedCount / length, 2)) + "% \r"
        sys.stdout.write(progressStr)
        sys.stdout.flush()
        # Wait for queued tasks to finish, printing progress along the way
        for r in rs:
            r.wait()
            finishedCount += 1
            progressStr = " Validation Progress: "
            progressStr += str(round(100 * finishedCount / length, 2)) + "% \r"
            sys.stdout.write(progressStr)
            sys.stdout.flush()
            output = r.get()
            results.append(output)
        print()
    except KeyboardInterrupt:
        # Print partial results and exit
        print("Aborted - printing individual fold results:")
        pprint(results)
        avgResults = sorted(averageFolds(results), key=lambda x: x[-1])
        pprint(avgResults)
        sys.stdout.flush()
        return aborted
    else:
        p.close()
        p.join()
        # Print results
        avgResults = sorted(averageFolds(results), key=lambda x: x[-1])
        print("Individual fold results:")
        pprint(results)
        print("Aggregate Cross Validation results:")
        pprint(avgResults)
        return finished

# Loads train and test data for a given fold number
def getFoldData(foldNum):
    trainFile = "train_" + str(foldNum) + ".dat"
    testFile = "test_" + str(foldNum) + ".dat"
    trainData = unserialize(trainFile)
    testData = unserialize(testFile)
    return trainData, testData

# Runs prediction on the testData, and returns a tuple of all relevant results
def predict(classifier, paramCombo, foldNum, startTime, testData):
    pred = classifier.predict(testData['data'].toarray())
    # Compare predicted labels with actual labels, maintaining a count of matches
    correctCount = 0
    for x in range(len(testData['target'])):
        if pred[x] == testData['target'][x]:
            correctCount += 1
    return (paramCombo,
            foldNum,
            time() - startTime,
            correctCount,
            len(testData['target']))
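
# A minimal sketch of the "crossValidate<Algo>()" pattern described above.
# The real per-fold workers live in mlAlgos; the names below (runNBFold,
# crossValidateNB), the MultinomialNB choice, and the 'alpha' grid are
# illustrative assumptions, not part of this project.
def runNBFold(foldNum, paramCombo):
    # Called in parallel by crossValidate() with args=(foldNum, paramCombo)
    from sklearn.naive_bayes import MultinomialNB
    startTime = time()
    trainData, testData = getFoldData(foldNum)
    clf = MultinomialNB(alpha=paramCombo['alpha'])
    clf.fit(trainData['data'].toarray(), trainData['target'])
    # Return the tuple built by predict(), so averageFolds() can aggregate it
    return predict(clf, paramCombo, foldNum, startTime, testData)

def crossValidateNB():
    aborted = crossValidate(runNBFold, [([0.1, 0.5, 1.0], 'alpha')])
    if aborted:
        exit(-1)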