models.py
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Bidirectional, Conv1D, concatenate
from keras.layers import Dropout, SpatialDropout1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.models import Model
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize
from scipy import sparse
import numpy as np
from tools import evaluate
# TODO : add spatial dropout and/or batch norm
# TODO : stack LSTM layers in bidirectional_lstm()
# TODO : add regularization to limit over-fitting
# TODO : add normalization and dense layer after auxiliary input?
##########################################
########### NEURAL NETS ##################
##########################################
def yoon_kim(sentence_length=200, vocab_size=30000,
n_filters=100, filters_sizes=(3, 5, 7),
embedding_dim=150, embedding_matrix=None, train_embeddings=True,
aux_input_dim=None):
"""
    Compile a Keras nnet model. The returned model is a convolutional net adapted to NLP, inspired by Yoon Kim's article.
:param sentence_length: fixed length of our truncated/padded numerical sentences.
:param vocab_size: dimension of our vocabulary set.
:param n_filters: number of kernels trained by each parallel conv layer.
:param filters_sizes: kernel sizes of each parallel conv layer.
:param embedding_dim: dimension of word vectors.
:param embedding_matrix: the initial weights to give to embedding layer.
    :param train_embeddings: whether the embedding layer weights are trainable.
:param aux_input_dim: dimension of an auxiliary input added in the last dense part of the nnet.
:return: the compiled keras model, ready to fit()
"""
# input
main_input = Input(shape=(sentence_length,))
# embedding
x = Embedding(vocab_size, embedding_dim, input_length=sentence_length, trainable=train_embeddings,
weights=[embedding_matrix] if embedding_matrix is not None else None)(main_input)
    # Build one convolution branch per kernel size (i.e. per n-gram length)
conv_layers, pool_layers = [None] * len(filters_sizes), [None] * len(filters_sizes)
for i_layer, filter_size in enumerate(filters_sizes):
conv_layers[i_layer] = Conv1D(filters=n_filters, kernel_size=filter_size, activation='relu')(x)
pool_layers[i_layer] = GlobalMaxPooling1D()(conv_layers[i_layer])
# Gather all convolution layers
    x = concatenate(pool_layers, axis=1)
# auxiliary input
if aux_input_dim:
aux_input = Input(shape=(aux_input_dim,), name='aux_input')
# merge all inputs
x = concatenate([x, aux_input])
# x = Dropout(0.1)(x)
# x = Dense(50, activation='relu')(x)
# x = Dropout(0.1)(x)
outp = Dense(6, activation='sigmoid')(x)
# build final model
model = Model(inputs=[main_input, aux_input] if aux_input_dim else main_input, outputs=outp)
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
# ready for .fit() !
return model
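# Illustrative usage sketch (not part of the original module): builds the
# Yoon Kim model with a hypothetical 4-dimensional auxiliary input and fits it
# on random dummy data, just to show the expected input/output shapes.
def _demo_yoon_kim():
    X_main = np.random.randint(0, 30000, size=(32, 200))   # padded token ids
    X_aux = np.random.rand(32, 4)                           # engineered features
    y = np.random.randint(0, 2, size=(32, 6))               # multi-label targets
    model = yoon_kim(sentence_length=200, vocab_size=30000, aux_input_dim=4)
    model.fit([X_main, X_aux], y, batch_size=16, epochs=1, verbose=0)
    return model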
def bidirectional_lstm(sentence_length=200, vocab_size=30000,
# lstm_sizes=(60,),
embedding_dim=150, embedding_matrix=None, train_embeddings=True,
aux_input_dim=None):
"""
    Compile a Keras nnet model. The returned model is an RNN built on a bidirectional LSTM layer.
:param sentence_length: fixed length of our truncated/padded numerical sentences.
:param vocab_size: dimension of our vocabulary set.
:param embedding_dim: dimension of word vectors.
:param embedding_matrix: the initial weights to give to embedding layer.
    :param train_embeddings: whether the embedding layer weights are trainable.
:param aux_input_dim: dimension of an auxiliary input added in the last dense part of the nnet.
:return: the compiled keras model, ready to fit()
"""
# main input (comments)
main_input = Input(shape=(sentence_length,), name='main_input')
# embedding
x = Embedding(vocab_size, embedding_dim, input_length=sentence_length, trainable=train_embeddings,
weights=[embedding_matrix] if embedding_matrix is not None else None)(main_input)
x = SpatialDropout1D(0.1)(x)
# LSTM layers
# lstm_layers = [None] * len(lstm_sizes)
# for i_layer, layer_size in enumerate(lstm_sizes):
# lstm_layers[i_layer] = Bidirectional(LSTM(layer_size, return_sequences=True))(x)
# x = lstm_layers[i_layer]
x = Bidirectional(LSTM(60, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# pooling
max_pool = GlobalMaxPooling1D()(x)
avg_pool = GlobalAveragePooling1D()(x)
x = concatenate([max_pool, avg_pool])
# auxiliary input
if aux_input_dim:
aux_input = Input(shape=(aux_input_dim,), name='aux_input')
# merge all inputs
x = concatenate([x, aux_input])
# dense 1
x = Dense(50, activation="relu")(x)
# dropout 1
x = Dropout(0.1)(x)
# final dense
outp = Dense(6, activation="sigmoid")(x)
# build final model
model = Model(inputs=[main_input, aux_input] if aux_input_dim else main_input, outputs=outp)
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
# ready to .fit() !
return model
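# Hypothetical helper sketch (not part of the original module): builds the
# `embedding_matrix` expected by the builders above from a Keras Tokenizer
# `word_index` and an `embeddings_index` dict mapping words to pretrained
# vectors (e.g. parsed from a GloVe/fastText file). Both arguments are assumed.
def _build_embedding_matrix(word_index, embeddings_index, vocab_size=30000, embedding_dim=150):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i >= vocab_size:
            continue  # keep only the `vocab_size` most frequent words
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector  # words without a pretrained vector stay at zero
    return embedding_matrix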
def bidir_lstm_conv(sentence_length=200, vocab_size=30000,
embedding_dim=150, embedding_matrix=None, train_embeddings=True,
aux_input_dim=None):
"""
    Compile a Keras nnet model. The returned model is a bidirectional LSTM followed by a convolutional layer.
:param sentence_length: fixed length of our truncated/padded numerical sentences.
:param vocab_size: dimension of our vocabulary set.
:param embedding_dim: dimension of word vectors.
:param embedding_matrix: the initial weights to give to embedding layer.
    :param train_embeddings: whether the embedding layer weights are trainable.
:param aux_input_dim: dimension of an auxiliary input added in the last dense part of the nnet.
:return: the compiled keras model, ready to fit()
"""
# main input (comments)
main_input = Input(shape=(sentence_length,), name='main_input')
# embedding
x = Embedding(vocab_size, embedding_dim, input_length=sentence_length, trainable=train_embeddings,
weights=[embedding_matrix] if embedding_matrix is not None else None)(main_input)
x = SpatialDropout1D(0.1)(x)
x = Bidirectional(LSTM(60, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(60, kernel_size=3, padding="valid")(x)
# add activation (ReLU) layer ?
# pooling
max_pool = GlobalMaxPooling1D()(x)
avg_pool = GlobalAveragePooling1D()(x)
x = concatenate([max_pool, avg_pool])
# auxiliary input
if aux_input_dim:
aux_input = Input(shape=(aux_input_dim,), name='aux_input')
# merge all inputs
x = concatenate([x, aux_input])
# dense 1
# x = Dense(50, activation="relu")(x)
# dropout 1
# x = Dropout(0.1)(x)
# final dense
outp = Dense(6, activation="sigmoid")(x)
# build final model
model = Model(inputs=[main_input, aux_input] if aux_input_dim else main_input, outputs=outp)
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
# ready to .fit() !
    return model
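# Hedged training sketch (not part of the original module): each builder above
# returns a compiled model, so a typical fit with early stopping on a held-out
# validation split could look like this. `X_train` must match the model's
# input format (a single array, or [main, aux] when aux_input_dim is used).
def _demo_fit_with_early_stopping(model, X_train, y_train):
    from keras.callbacks import EarlyStopping
    early_stop = EarlyStopping(monitor='val_loss', patience=2)
    history = model.fit(X_train, y_train,
                        batch_size=256, epochs=10,
                        validation_split=0.1,
                        callbacks=[early_stop])
    return history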
##########################################
########### STANDARD CLASSIFIERS #########
##########################################
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
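    """
    NB-SVM style classifier (in the spirit of Wang & Manning, 2012): the
    features are scaled by naive-Bayes log-count ratios, then a
    LogisticRegression is fitted on the scaled features. Expects a sparse
    feature matrix (e.g. TF-IDF) and binary labels.
    """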
def __init__(self, C=1.0, dual=False, n_jobs=1, solver='liblinear'):
self.C = C
self.dual = dual
self.n_jobs = n_jobs
self.solver = solver
def predict(self, x):
"""
Give prediction with learned model on given data
:param x: scipy sparse matrix, (n_samples, n_features)
:return prediction: ndarray (n_samples, n_classes), binary prediction
for each class
"""
# Verify that model has been fit
check_is_fitted(self, ['_r', '_clf'])
return self._clf.predict(x.multiply(self._r))
def predict_proba(self, x):
"""
Give class probability prediction with learned model on given data
        :param x: scipy sparse matrix, (n_samples, n_features)
:return prediction: ndarray (n_samples, n_classes), probability prediction
for each class given by the model
"""
# Verify that model has been fit
check_is_fitted(self, ['_r', '_clf'])
return self._clf.predict_proba(x.multiply(self._r))
def fit(self, x, y):
"""
        Learn the NB-SVM model on x according to y: compute the naive-Bayes
        log-count ratios and fit a LogisticRegression on the re-weighted features.
:param x: scipy sparse matrix, (n_samples, n_features)
:param y: numpy array, (n_samples,)
:return model: the learned model
"""
# Check that X and y have correct shape
x, y = check_X_y(x, y, accept_sparse=True)
def pr(x, y_i, y):
idx = np.where(y == y_i)
p = x[idx].sum(0)
return (p + 1) / ((y == y_i).sum() + 1)
self._r = sparse.csr_matrix(np.log(pr(x, 1, y) / pr(x, 0, y)))
x_nb = x.multiply(self._r)
self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs, solver=self.solver).fit(x_nb, y)
return self
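# Illustrative usage sketch (not part of the original module): fits the NB-SVM
# classifier on a tiny dummy corpus turned into a sparse TF-IDF matrix for a
# single binary label, then returns the positive-class probabilities.
def _demo_nbsvm():
    from sklearn.feature_extraction.text import TfidfVectorizer
    texts = ['good comment', 'toxic insult', 'nice words', 'awful insult']
    labels = np.array([0, 1, 0, 1])
    x = TfidfVectorizer().fit_transform(texts)   # sparse (n_samples, n_features)
    clf = NbSvmClassifier(C=4.0).fit(x, labels)
    return clf.predict_proba(x)[:, 1]            # probability of the positive class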
class OneVAllClassifier(BaseEstimator, ClassifierMixin):
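    """
    One-vs-all wrapper: fits one binary classifier of type `clf` per class.
    `params` may map a parameter name to a list of per-class values (one value
    for each of the `n_classes` underlying classifiers).
    """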
def __init__(self, n_classes, clf=NbSvmClassifier, params={}):
self.models = []
self.n_classes = n_classes
if params != {}:
for i_class in range(self.n_classes):
param_clf = {}
for (param, param_val) in params.items():
assert (len(param_val) == self.n_classes)
param_clf[param] = param_val[i_class]
self.models.append(clf(**param_clf))
else:
for i_class in range(self.n_classes):
self.models.append(clf())
def fit(self, X, y):
"""
        Learn the 1-vs-all model on X according to y, by fitting one model of the
        specified estimator for each class.
        :param X: scipy sparse matrix, (n_samples, n_features)
:param y: numpy array, (n_samples,)
:return model: the learned model
"""
assert (y.shape[1] == self.n_classes)
for i_class in range(self.n_classes):
print('Fitting model {}:'.format(i_class))
self.models[i_class].fit(X, y[:, i_class])
return self
def predict_proba(self, X):
"""
Give class probability prediction with learned model on given data
:param X: scipy sparse matrix, (n_samples, n_features)
:return prediction: ndarray (n_samples, n_classes), probability prediction
for each class given by each 1vAll model
"""
y_pred = np.ones((X.shape[0], self.n_classes))
for i_class in range(self.n_classes):
y_pred[:, i_class] = self.models[i_class].predict_proba(X)[:, 1]
return y_pred
def predict(self, X):
"""
Give prediction with learned model on given data
:param X: scipy sparse matrix, (n_samples, n_features)
:return prediction: ndarray (n_samples, n_classes), binary prediction
for each class given by each 1vAll model
"""
y_pred = np.ones((X.shape[0], self.n_classes))
for i_class in range(self.n_classes):
y_pred[:, i_class] = self.models[i_class].predict(X)
return y_pred
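# Illustrative usage sketch (not part of the original module): one NB-SVM model
# per class on a dummy 6-label problem; the targets are laid out so that every
# column contains both classes, as each underlying binary classifier requires.
def _demo_one_v_all():
    from sklearn.feature_extraction.text import TfidfVectorizer
    texts = ['clean text', 'rude text', 'clean words', 'rude words']
    y = np.array([[0, 1, 0, 1, 0, 1],
                  [1, 0, 1, 0, 1, 0],
                  [0, 1, 1, 0, 0, 1],
                  [1, 0, 0, 1, 1, 0]])
    X = TfidfVectorizer().fit_transform(texts)
    ova = OneVAllClassifier(n_classes=6, clf=NbSvmClassifier).fit(X, y)
    return ova.predict_proba(X)   # (n_samples, 6)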
##########################################
########### MODEL MIX ####################
##########################################
def model_mix(y_preds, y_true):
"""
    Load predictions from different models and compute the optimal weights for the model mix
:param y_preds: list of tuple (str, ndarray), each tuple has the name of a
model and a ndarray of predicted proba for each class
:param y_true: ndarray (n_samples, n_classes), should have the same shape as all y_preds
    :return optimal_weights: weights for blending the model predictions,
        optimized for these models
"""
names, y_pred_list = zip(*y_preds)
for i in range(len(names)):
assert (y_pred_list[i].shape == y_true.shape)
print('Prediction score on {}: {:.4f}'.format(names[i], evaluate(y_true, y_pred_list[i])))
# --------------------------------
# Find ensemble learning weights
# --------------------------------
    # We want to maximize the evaluation score of the blended prediction;
    # scipy.optimize only minimizes, so we minimize its negative
    def score_func(weights, func=evaluate):
        final_prediction = 0
for weight, prediction in zip(weights, y_pred_list):
final_prediction += weight * prediction
return -func(y_true, final_prediction)
# Uniform initialisation
init_weights = np.ones((len(y_pred_list),)) / len(y_pred_list)
print('Initial mix score: {:.4f}'.format(-score_func(init_weights)))
# Weights are in range [0; 1] and must sum to 1
constraint = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
bounds = [(0, 1)] * len(y_pred_list)
    # Compute the best weights (SLSQP chosen following the advice of a Kaggle kernel)
res = minimize(score_func, init_weights, method='SLSQP', bounds=bounds, constraints=constraint)
optimal_weights = res['x']
print('Model mix prediction on train: {:.4f}'.format(-score_func(optimal_weights)))
return optimal_weights
def model_mix_predict(y_preds, optimal_weights):
"""
    Take per-model predictions and blending weights and return the weighted prediction
    over the samples
    :param y_preds: list of tuple (str, ndarray), each tuple has the name of a
        model and a ndarray of predicted proba for each class
    :param optimal_weights: list of float, weight for each model (sum(weights) == 1)
    :return final_prediction: ndarray, (n_samples, n_classes), blended probability for each class for each sample
"""
names, y_pred_list = zip(*y_preds)
final_prediction = np.zeros(y_pred_list[0].shape)
for weight, prediction in zip(optimal_weights, y_pred_list):
final_prediction += weight * prediction
return final_prediction
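# Illustrative end-to-end sketch (not part of the original module): random dummy
# predictions and labels stand in for real model outputs, just to show how
# model_mix() and model_mix_predict() are chained; it assumes tools.evaluate
# takes (y_true, y_pred_proba) as it does above.
def _demo_model_mix():
    y_val = np.random.randint(0, 2, size=(100, 6))
    val_preds = [('model_a', np.random.rand(100, 6)),
                 ('model_b', np.random.rand(100, 6))]
    weights = model_mix(val_preds, y_val)            # optimize blending weights on validation data
    test_preds = [('model_a', np.random.rand(50, 6)),
                  ('model_b', np.random.rand(50, 6))]
    return model_mix_predict(test_preds, weights)    # blended test predictions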