# utilities.py
import os
import sys
import json
import numpy as np
from nltk.tokenize import word_tokenize
import skpt.skipthoughts as sk
from solver import LassoSolver
#---------------------------------------------------------------------------
# Utility Functions
#---------------------------------------------------------------------------
def _prog(strn, at, total):
"""Prints progress."""
p = round(float(100*at)/total, 2)
if at>=total:
sys.stdout.write(strn+' ['+str(p)+'%]...\n')
else:
sys.stdout.write(strn+' ['+str(p)+'%]...\r')
sys.stdout.flush()
def _load(file):
"""Loads a JSON file.
Args:
file (str): Path to JSON file.
Returns:
obj: The loaded object.
"""
with open(file, 'r') as f:
data = json.load(f)
return data
def _save(data, file):
"""Saves a list or dict in a JSON file.
Args:
        data (list or dict): A Python list or dict.
file (str): Path to output JSON file.
"""
with open(file, 'w') as f:
json.dump(data, f)
def _clean(sentence):
"""Cleans a given sentence by separating the tokens by spaces.
Args:
sentence (str): A general str sentence.
Returns:
str: The cleaned sentence.
"""
sentence = word_tokenize(str(sentence).lower())
return ' '.join(sentence)
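#Example (illustrative): _clean("What's on the table?") would return
#"what 's on the table ?" using NLTK's default word tokenizer.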
def _overwrite_dict(dict_dist, dict_src):
    """Overwrites the contents of `dict_dist` with those of `dict_src` in place."""
    dict_dist.clear()
    dict_dist.update(dict_src)
def _get_files_paths(directory, extension=None):
"""Traverse a folder and its subdirectories.
Args:
directory (str): A path to a directory.
extension (str, optional): The extension of the target files. Defaults to None.
Returns:
list: The list of all files with the specified extension.
if `extension` is None, it returns the list of all files.
"""
files = []
for f in os.listdir(directory):
file = os.path.join(directory, f)
if os.path.isfile(file):
if extension:
if file[file.rfind('.')+1:]!=extension:
continue
files.append(file)
else:
            subfiles = _get_files_paths(file, extension)
files.extend(subfiles)
return files
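#Example (illustrative, hypothetical path): _get_files_paths('Questions', extension='json')
#would return every *.json file found under `Questions` and its subdirectories.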
def _apply_on_each(dataset, keys, funcs, output_file=None, force=False, verbose=True):
"""Apply a function on each element in the dataset.
Args:
dataset (dict): A dataset dictionary with keys {'data':{'question'}}.
keys (list): A list of keys under the key `data` in the dataset.
funcs (list): A list of functions to call on each element for each key in `keys`.
output_file (str, optional): Path to output JSON file. Defaults to None.
force (bool, optional): Forces writing to output file. Defaults to False.
verbose (bool, optional): Forces printing progress. Defaults to True.
"""
if not force and output_file and os.path.isfile(output_file):
return
for i in range(len(dataset['data']['question'])):
if verbose and i%500==0: _prog('processing questions', i+1, len(dataset['data']['question']))
for j in range(len(keys)):
dataset['data'][keys[j]][i] = funcs[j](dataset['data'][keys[j]][i])
if verbose: _prog('processing questions', i+1, len(dataset['data']['question']))
if output_file:
_save(dataset, output_file)
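#Example (illustrative): _apply_on_each(dataset, keys=['question'], funcs=[_clean])
#would tokenize and lowercase every entry of dataset['data']['question'] in place.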
def _remove_file(dataset, indx):
"""Removes all the questions of a specific file from the dataset.
Args:
dataset (dict): The dataset.
indx (int): The index of the file.
"""
s = sum([l for l in dataset['lengths'][:indx]])
count = dataset['lengths'][indx]
keys = dataset['data'].keys()
for k in keys:
dataset['data'][k] = dataset['data'][k][:s]+dataset['data'][k][s+count:]
dataset['files'] = dataset['files'][:indx]+dataset['files'][indx+1:]
dataset['lengths'] = dataset['lengths'][:indx]+dataset['lengths'][indx+1:]
def _occurrences_indices(lst, strn):
"""Find the indecies of all the occurrences of `strn` in .
Args:
lst (list): A list of of str.
strn (str): A keyword.
Returns:
list: Indices of the occurrences.
"""
rslt = []
for i in range(len(lst)):
if strn.lower() in lst[i].lower():
rslt.append(i)
return rslt
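#Example (illustrative): _occurrences_indices(['a_train.json', 'b_val.json'], 'train') returns [0].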
#---------------------------------------------------------------------------
# Handy Functions
#---------------------------------------------------------------------------
def extract_vqa(folder, output_file=None, force=False, verbose=True):
"""Extract all the questions in VQA dataset into a JSON file.
Args:
folder (str): Path to VQA dataset directory.
output_file (str, optional): Path to output JSON file. Defaults to None.
force (bool, optional): Forces writing to the output file. Defaults to False.
verbose (bool, optional): Forces printing progress. Defaults to True.
Returns:
dict: The loaded dataset with keys {'files', 'lengths', 'data':{'question',...}}.
"""
if verbose: print('[Extract VQA dataset]')
if not force and output_file and os.path.isfile(output_file):
return _load(output_file)
if os.path.isfile(folder):
files = [folder]
else:
files = _get_files_paths(folder, extension='json')
if verbose: print('processing '+str(len(files))+' files...')
extract = {
'files':[0]*len(files),
'lengths':[0]*len(files),
'data':{}, #parallel arrays
}
accum = 0
for i in range(len(files)):
if verbose: print('processing `'+files[i]+'`...')
#add file name to extract
        extract['files'][i] = os.path.basename(files[i])
#add length of data to extract
data = _load(files[i])['questions']
extract['lengths'][i] = len(data)
#add the keys of the data
if i == 0:
keys = data[0].keys()
for k in keys:
extract['data'][k] = [0]*len(data)
else:
for k in keys:
extract['data'][k].extend([0]*len(data))
#fill in the data
for j in range(accum, accum+len(data)):
if verbose and j%500==0: _prog('extracted questions', j+1-accum, len(data))
for k in keys:
extract['data'][k][j] = data[j-accum][k]
        if verbose: _prog('extracted questions', len(data), len(data))
        accum += len(data)
if output_file:
if verbose: print('saving to `'+output_file+'`...')
_save(extract, output_file)
if verbose: print('done')
return extract
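#Example (illustrative, hypothetical path): extract = extract_vqa('VQA/Questions', 'extract.json')
#yields parallel arrays, e.g. extract['data']['question'][k] is the k-th question string
#pooled across all the JSON files listed in extract['files'].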
def preprocess(dataset, output_file=None, force=False, verbose=True):
"""Preprocess all the questions in the dataset.
This will add `_copy_of` key as {'data':{'_copy_of'}} to `dataset`.
Args:
dataset (dict): A dataset dictionary with keys {'data':{'question'}}.
output_file (str, optional): Path to output JSON file. Defaults to None.
force (bool, optional): Forces writing to file. Defaults to False.
verbose (bool, optional): Forces printing progress. Defaults to True.
"""
if verbose: print('[Preprocess dataset]')
if not force and output_file and os.path.isfile(output_file):
_overwrite_dict(dataset, _load(output_file))
return
    if force or dataset['data'].get('_copy_of') is None:
#clean all questions
_apply_on_each(dataset=dataset, keys=['question'], funcs=[_clean], output_file=output_file, force=force, verbose=verbose)
#mark all repeated questions as copy of the first occurrence
N = sum([l for l in dataset['lengths']])
if verbose: print('dealing with duplicate questions...')
        ##get a list of lists of (sorted) positions of the questions that hash equal
        questions = [(i, hash(dataset['data']['question'][i])) for i in range(N)]
        questions = sorted(questions, key=lambda x: x[1])
indecies = []
i = 0
while i < N:
if verbose: _prog('finding duplicate questions', i+1, N)
indecies.append([i])
            for j in range(i+1,N):
                if questions[i][1]!=questions[j][1]:
                    i = j-1
                    break
                indecies[-1].append(j)
            else:
                #the last group reached the end of the list; skip past it
                i = N-1
            i = i+1
        if verbose: _prog('finding duplicate questions', N, N)
        ##mark them as copies of their first occurrence in the original order
        dataset['data']['_copy_of'] = [0]*N
        for ind in indecies:
            #map sorted positions back to original question indices
            ind = sorted([questions[p][0] for p in ind])
            for i in ind:
                dataset['data']['_copy_of'][i] = ind[0]
if output_file:
if verbose: print('saving to `'+output_file+'`...')
_save(dataset, output_file)
if verbose: print('done')
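#After preprocessing, dataset['data']['_copy_of'][i] == i exactly when question i is the first
#occurrence of its (cleaned) text; later duplicates point back to that first index.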
def embed(dataset, output_file=None, force=False, verbose=True):
"""Embed all the questions in the dataset into skip thoughts vector space.
    This will add `_embedded` key as {'data':{'_embedded'}} to `dataset`.
Args:
dataset (dict): A dataset dictionary with keys {'data':{'question'}}.
output_file (str, optional): Path to output JSON file. Defaults to None.
force (bool, optional): Forces embedding the questions. Defaults to False.
verbose (bool, optional): Forces printing progress. Defaults to True.
"""
if verbose: print('[Embed VQA dataset]')
if not force and output_file and os.path.isfile(output_file):
_overwrite_dict(dataset, _load(output_file))
        for i in range(len(dataset['data']['_embedded'])):
dataset['data']['_embedded'][i] = np.array(dataset['data']['_embedded'][i])
return
    if force or dataset['data'].get('_embedded') is None:
model = sk.load_model()
N = sum([l for l in dataset['lengths']])
dataset['data']['_embedded'] = [None]*N
for i in range(N):
if verbose: _prog('embedding questions', i+1, N)
if dataset['data']['_copy_of'][i] != i:
dataset['data']['_embedded'][i] = dataset['data']['_embedded'][dataset['data']['_copy_of'][i]]
else:
encd = sk.encode(model, [dataset['data']['question'][i]], verbose=False)
dataset['data']['_embedded'][i] = np.reshape(encd, (encd.shape[1],))
if verbose: _prog('embedding questions', i+1, N)
if output_file:
if verbose: print('saving to `'+output_file+'`...')
for i in range(len(dataset['data']['_embedded'])):
dataset['data']['_embedded'][i] = [float(x) for x in dataset['data']['_embedded'][i]]
_save(dataset, output_file)
for i in range(len(dataset['data']['_embedded'])):
dataset['data']['_embedded'][i] = np.array(dataset['data']['_embedded'][i])
if verbose: print('done')
def vqa_subset(vqa_file, output_file=None, force=False, abstract=False, real=False,
               train=False, validation=False, test=False, dev=False, open_ended=False,
               multiple_choice=False, verbose=True):
"""Load subset of VQA dataset from VQA JSON file.
Args:
vqa_file (str): Path to VQA JSON file.
output_file (str, optional): Path to output JSON file. Defaults to None.
force (bool, optional): Forces saving the subset. Defaults to False.
abstract (bool, optional): Include abstract questions. Defaults to False.
real (bool, optional): Include real questions. Defaults to False.
train (bool, optional): Include train questions. Defaults to False.
validation (bool, optional): Include validation questions. Defaults to False.
test (bool, optional): Include test questions. Defaults to False.
dev (bool, optional): Include test-dev questions. Defaults to False.
open_ended (bool, optional): Include OpenEnded questions. Defaults to False.
        multiple_choice (bool, optional): Include MultipleChoice questions. Defaults to False.
        verbose (bool, optional): Forces printing progress. Defaults to True.
Returns:
dict: The loaded dataset.
"""
if verbose: print('[Extract subset of VQA dataset]')
if not force and output_file and os.path.isfile(output_file):
return _load(output_file)
dataset = _load(vqa_file)
    def process(cond, strn):
        if not cond:
            #remove in reverse index order so earlier indices stay valid
            occ = _occurrences_indices(dataset['files'], strn)
            for o in sorted(occ, reverse=True):
                _remove_file(dataset, o)
process(abstract, '_abstract_')
process(train, 'train20')
process(validation, 'val20')
process(test, 'test20')
process(dev, 'test-dev20')
process(open_ended, 'OpenEnded')
process(multiple_choice, 'MultipleChoice')
    if not real:
        occ = _occurrences_indices(dataset['files'], '_abstract_')
        #remove in reverse index order so earlier indices stay valid
        for i in reversed(range(len(dataset['files']))):
            if i not in occ:
                _remove_file(dataset, i)
if output_file:
_save(dataset, output_file)
return dataset
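#Example (illustrative): vqa_subset('extract.json', real=True, train=True, open_ended=True)
#would keep only the real-image, training-split, OpenEnded question files.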
def get_embedding(dataset, chunks=1, output=None, force=False, verbose=True):
"""Merge the embeddings of the entire dataset.
Args:
dataset (dict): The dataset.
chunks (int, optional): Total number of chunks. Defaults to 1.
output (str, optional): Path to NPY file or to a directory if `chunks`>1. Defaults to None.
        force (bool, optional): Forces rebuilding the embedding matrix. Defaults to False.
        verbose (bool, optional): Forces printing progress. Defaults to True.
    Returns:
        numpy.ndarray or list: The embedding matrix, or a list of matrices when `chunks` > 1.
"""
if verbose: print('[Extract embedding]')
if chunks > 1:
if not os.path.exists(output):
if verbose: print('creating folder `'+output+'`...')
os.makedirs(output)
else:
files = os.listdir(output)
if not force and len(files)>=chunks:
try:
result = [0]*chunks
for i in range(chunks):
result[i] = np.load(os.path.join(output, str(i)+'.npy'))
return result
                except Exception:
pass
else:
if not force and output and os.path.isfile(output):
return np.load(output)
    m = dataset['data']['_embedded'][0].shape[0]
    N = sum([l for l in dataset['lengths']])
    #collect the indices of the unique (non-copy) questions
    indecies = [0]*N
    n = 0
    for i in range(N):
        if dataset['data']['_copy_of'][i]==i:
            indecies[n] = i
            n += 1
    indecies = indecies[:n]
    items_per_chunk = int(np.ceil(float(n)/chunks))
    result = [0]*chunks
    for c in range(chunks):
        lo = c*items_per_chunk
        hi = min((c+1)*items_per_chunk, n)
        mat = np.zeros((m, hi-lo))
        for i in range(lo, hi):
            mat[:,i-lo] = dataset['data']['_embedded'][indecies[i]]
        result[c] = mat
if len(result)==1:
if output:
np.save(output, result[0])
return result[0]
else:
if output:
for i in range(len(result)):
np.save(os.path.join(output, str(i)+'.npy'), result[i])
return result
'''
#decompose an embedded question into basic questions given questions and their embeddings
#return the list of dictionaries of top n decomposed questions with their score
def decompose(question, questions, embeddings, top_n=20, l=1e-5):
slvr = LassoSolver(l=l)
x = slvr.solve(embeddings, question)
decomposition = [(questions[i], float(x[i])) for i in range(len(x)) if float(x[i])!=0]
decomposition = sorted(decomposition, key=lambda x:x[1], reverse=True)
return decomposition[:top_n]
'''
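#---------------------------------------------------------------------------
# Example Usage
#---------------------------------------------------------------------------
#An illustrative sketch of the full pipeline; the paths and file names below are
#hypothetical, and the skip-thoughts model must be available for `embed` to run.
if __name__ == '__main__':
    #extract every question from the (assumed) VQA Questions folder
    vqa = extract_vqa('VQA/Questions', output_file='vqa_extract.json')
    #keep only the real-image, OpenEnded training questions
    subset = vqa_subset('vqa_extract.json', output_file='vqa_train.json',
                        real=True, train=True, open_ended=True)
    #clean the questions and mark duplicates, then embed them with skip-thoughts
    preprocess(subset, output_file='vqa_train_clean.json')
    embed(subset, output_file='vqa_train_embedded.json')
    #stack the embeddings of the unique questions into a single matrix
    basis = get_embedding(subset, output='vqa_train_embedding.npy')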