-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtweak_features.py
executable file
·500 lines (407 loc) · 21.9 KB
/
tweak_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
#!/usr/bin/env python
# encoding: utf-8
"""
tweak_features.py
"""
import sys
import os
import argparse
import logging
import logging.handlers
import ast
import pandas as pd
import multiprocessing as mp
from sklearn.externals import joblib
# Layout of the records printed on the console
CONSOLE_LOGGING_FORMAT = '%(asctime)-15s *** %(levelname)s *** %(message)s'
# Layout of the records written to the log file (adds the source location)
FILE_LOGGING_FORMAT = '%(asctime)-15s *** %(levelname)s [%(filename)s:%(lineno)s - %(funcName)s()] *** %(message)s'

# Module-level logger (named after this module, not the root logger).
# It accepts DEBUG and above; each handler then applies its own threshold.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Console handler: records of level INFO and above go to stdout
console_logging_format = logging.Formatter(CONSOLE_LOGGING_FORMAT)
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(console_logging_format)
logger.addHandler(console_handler)

# File handler: records of level DEBUG and above go to a per-process log
# file (the PID is part of the file name), rotated at 5 * 1048576 bytes
# with 2 backups
file_logging_format = logging.Formatter(FILE_LOGGING_FORMAT)
file_handler = logging.handlers.RotatingFileHandler(
    "tweak_features_" + str(os.getpid()) + ".log",
    mode='w',
    maxBytes=(1048576 * 5),
    backupCount=2,
    encoding=None,
    delay=0)
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(file_logging_format)
logger.addHandler(file_handler)
def get_options(cmd_args=None):
    """
    Parse the command line arguments of this script.

    Args:
        cmd_args (list(str)): arguments to parse; it is forwarded as-is to
                              argparse's parse_args (default = None)
    Return:
        options (dict): the parsed values, stored under the keys
                        'dataset_filename', 'model_filename', 'paths_filename',
                        'output_dirname' and 'epsilon'
    """
    cmd_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # The four mandatory positional arguments, in the order they are expected
    positional_arguments = (
        ('dataset_filename',
         """Path to the input file containing the original dataset (features + class labels)."""),
        ('model_filename',
         """Path to the input file containing the serialized model."""),
        ('paths_filename',
         """Path to the input file containing the (positive) paths represented by the model."""),
        ('output_dirname',
         """Path to the output directory containing results."""),
    )
    for argument_name, argument_help in positional_arguments:
        cmd_parser.add_argument(argument_name, help=argument_help, type=str)

    # The only optional argument; it is validated by check_valid_epsilon
    cmd_parser.add_argument(
        '-e',
        '--epsilon',
        help="""Tolerance used to pass the boolean tests encoded in each decision tree.""",
        type=check_valid_epsilon,
        default=0.01)

    args = cmd_parser.parse_args(cmd_args)

    return {
        'dataset_filename': args.dataset_filename,
        'model_filename': args.model_filename,
        'paths_filename': args.paths_filename,
        'output_dirname': args.output_dirname,
        'epsilon': args.epsilon,
    }
######################## Check Input Validity ###################
def check_valid_epsilon(value):
    """
    This function is responsible for checking the validity of the input threshold epsilon.
    It is meant to be used as the `type` callable of an argparse argument.

    Args:
        value (str): value passed as input argument to this script
    Return:
        fvalue (float): value converted to float, which is guaranteed to be > 0
    Raises:
        ValueError: if value cannot be converted to a float
        argparse.ArgumentTypeError: if the converted value is not > 0 (NaN included)
    """
    fvalue = float(value)
    # `not fvalue > 0` (instead of `fvalue <= 0`) also rejects NaN, for which
    # every ordering comparison is False
    if not fvalue > 0:
        raise argparse.ArgumentTypeError(
            "%s is an invalid value for epsilon which must be any x, such that x > 0" % fvalue)
    return fvalue
######################## Loading Dataset as a Pandas DataFrame object ####
def loading_dataset(input_filename, sep="\t", fillna=None):
    """
    This function is responsible for loading the input dataset.
    The internal representation of the dataset is a pandas.DataFrame object

    Args:
        input_filename (str): path to the input file containing the extracted data
        sep (str): character used to delimit the input file (default = tab)
        fillna: replacement value for NA entries; None means that NA entries
                are left untouched (default = None)
    Return:
        dataset (pandas.DataFrame): Pandas Dataframe object representing the input dataset
    """
    logger.info("Loading dataset from " + input_filename +
                " into Pandas DataFrame object...")

    logger.info("Inferring compression from " + input_filename + " (if any)")
    # Only the last extension is inspected; any other extension means
    # "no compression"
    extension = input_filename.split('.')[-1]
    compression = {'bz2': 'bz2', 'gz': 'gzip'}.get(extension)
    logger.info("Compression = " + str(compression))

    dataset = pd.read_csv(input_filename, sep=sep, compression=compression)

    # Compare against None explicitly, so that falsy replacements such as 0
    # or "" are honoured instead of being silently ignored
    if fillna is not None:
        logger.info(
            "Replacing missing values (i.e. NA) with \"" + str(fillna) + "\"")
        dataset.fillna(fillna, inplace=True)

    logger.info("Returning the dataset")
    return dataset
##########################################################################
##########################################################################
def loading_model(model_filename):
    """
    Deserialize a model from a file using joblib.

    Args:
        model_filename (str): path to the file where the model has been serialized and persisted
    Return:
        model: the object returned by joblib.load for that file
    """
    # NOTE(review): joblib deserialization is pickle-based as far as we know,
    # so model_filename should only ever point to a trusted file - confirm
    model = joblib.load(model_filename)
    return model
##########################################################################
##########################################################################
def loading_paths(paths_filename):
    """
    This function returns the internal representation of (positive) paths as
    extracted from the model.

    Each non-empty line of the file is parsed as follows: the first character
    and the last two characters are dropped, what precedes the first ", [" is
    read as the integer tree id and what follows is evaluated (with
    ast.literal_eval) as a comma-separated sequence of conditions.
    Presumably lines look like
        (tree_id, [(feature, direction, threshold), ...])
    (to be confirmed against the script producing this file).

    Args:
        paths_filename (str): path to the filename containing the persisted positive paths
    Return:
        paths (dict): a dictionary containing a key for each decision tree id found
                      in the file and, for each key, the list of its paths; each path
                      is in turn the list of conditions (tuples) parsed from one line
    Raises:
        ValueError: if a non-empty line does not contain ", [" or its tree id
                    is not an integer
    """
    paths = {}
    with open(paths_filename) as paths_file:
        for record in paths_file:
            record = record.strip()
            # blank lines (e.g. a trailing newline) carry no path
            if not record:
                continue
            # split on the first ", [" only: tree id on the left, the
            # conditions of the path on the right
            tree_field, conditions = record[1:-2].split(", [", 1)
            tree_id = int(tree_field)
            if conditions.strip():
                # the extra trailing comma makes a single-condition path
                # evaluate to a tuple of conditions too
                path = list(ast.literal_eval(conditions + ','))
            else:
                # empty list of conditions
                path = []
            paths.setdefault(tree_id, []).append(path)
    return paths
##########################################################################
############################ Retrieve True Negative Instances ############
def get_true_negatives(X, y, model):
    """
    This function retrieves those instances whose class labels are really negative
    and which the model also correctly predicts as negative

    Args:
        X (pandas.DataFrame): the matrix of features (m x n)
        y (pandas.Series): the vector of class labels (m x 1), indexed like X
        model (sklearn.ensemble): the trained (ensemble) classifier
    Returns:
        true_negatives (list): the index labels of the records x_i in X whose class
                               label y_i = -1 and whose predicted class label
                               y_i_hat = -1 as well
    Note:
        index labels are formatted with %d in the log messages, hence they
        must be integers
    """
    negative_label = -1
    true_negatives = []
    for i in X.index:
        logger.info("Prediction for instance ID#%d" % i)
        # label-based access (the .ix indexer has been removed from pandas)
        x_i = X.loc[i]
        y_i = y.loc[i]
        # predict() expects a 2-D array: reshape the feature values of this
        # single sample to (1, n_features)
        y_i_hat = model.predict(x_i.values.reshape(1, -1))[0]
        logger.info(
            "True Class Label = %d; Predicted Class Label = %d" % (y_i, y_i_hat))
        # both labels must be negative: a mere agreement between the two
        # would also let correctly predicted positives through
        if y_i == negative_label and y_i_hat == negative_label:
            logger.info(
                "True and Predicted Class Labels are negative! Let's add the corresponding index to the final list")
            true_negatives.append(i)
    return true_negatives
##########################################################################
# Map function to compute epsilon-transformation of an instance
def map_compute_epsilon_transformation(instance):
    """
    Unpack one work item and compute all its epsilon-transformations.

    Args:
        instance (tuple): (x, n, i, epsilon, model, paths), i.e. the arguments of
                          compute_epsilon_transformations_of_instance packed together
    Return:
        a tuple (i, transformations) where transformations is the value returned
        by compute_epsilon_transformations_of_instance
    """
    sample, position, instance_id, epsilon, model, paths = instance
    logger.info(
        "Computing all the possible epsilon-transformations for instance x n. %d [ID#%d]" % (position, instance_id))
    transformations = compute_epsilon_transformations_of_instance(
        sample, position, instance_id, epsilon, model, paths)
    return (instance_id, transformations)
##########################################################################
############### Compute epsilon-transformation of an instance ############
def _as_single_sample(vector):
    """
    Return vector as a 2-D array of shape (1, n_features), as expected by predict().
    Both a pandas.Series (through its .values) and a numpy array are accepted.
    """
    values = getattr(vector, "values", vector)
    return values.reshape(1, -1)


def compute_epsilon_transformations_of_instance(x, n, i, epsilon, model, paths):
    """
    This function computes all the candidate epsilon-transformations of an instance x.

    For each base decision tree whose prediction for x coincides with the one
    of the whole ensemble, x is transformed according to each (positive) path
    of that tree. A transformation x' is kept as a candidate only if both the
    tree and the whole ensemble predict the class label 1 for it.

    Args:
        x (pandas.Series): vector representing the instance
        n (int): sequential number of the instance (only used in log messages)
        i (int): identifier of the instance (only used in log messages)
        epsilon (float): tolerance used to pass the tests encoded in each path
        model (sklearn.ensemble): the trained (ensemble) classifier; its estimators_,
                                  classes_ and predict are used
        paths (dict): the paths of each decision tree, keyed by the position
                      of the tree in model.estimators_
    Return:
        x_transformations (dict): for each tree id having at least one candidate,
                                  the list of tuples (path_id, path_length, x_prime)
    """
    x_transformations = {}
    # feature values already computed for a path condition; shared by all the
    # paths of all the trees examined for this instance
    path_conditions = {}

    x_row = _as_single_sample(x)
    # The prediction of the ensemble for x does not depend on the tree under
    # examination: compute it once
    y_hat_ensemble = model.predict(x_row)[0]

    logger.info("Loop through all the decision trees of the ensemble...")
    for tree_id, decision_tree in enumerate(model.estimators_):
        logger.debug(
            "Check if the prediction for the instance x n. %d [ID#%d] of the base decision tree ID #%d coincides with the overall prediction of the ensemble" % (n, i, tree_id))
        # the prediction of the base tree is used as a position in
        # model.classes_ to get the actual class label
        y_hat_dt = model.classes_[int(decision_tree.predict(x_row)[0])]
        logger.debug("Class Label Prediction for x n. %d [ID#%d] according to the base decision tree ID #%d = %d" % (
            n, i, tree_id, y_hat_dt))
        logger.debug("Class Label Prediction for x n. %d [ID#%d] according to overall ensemble = %d" % (
            n, i, y_hat_ensemble))

        if y_hat_dt != y_hat_ensemble:
            continue

        logger.debug(
            "Both class label predictions are the same (and they are both negatives)")
        logger.debug(
            "Get all the positive paths of the decision tree ID #%d" % tree_id)
        # a tree without any positive path has no entry at all
        try:
            paths_dt = paths[tree_id]
        except (KeyError, IndexError):
            paths_dt = []

        logger.info(
            "Loop through all the positive paths of the decision tree ID #%d" % tree_id)
        for path_id, path in enumerate(paths_dt):
            logger.debug(
                "Compute x' as the epsilon-transformation of x n. %d [ID#%d] using the positive path ID #%d (length=%d) of tree ID #%d" % (n, i, path_id, len(path), tree_id))
            x_prime = compute_epsilon_transformation_path(
                x, epsilon, path, path_conditions)
            x_prime_row = _as_single_sample(x_prime)
            x_prime_class = model.classes_[
                int(decision_tree.predict(x_prime_row)[0])]
            logger.debug(
                "The predicted class label by tree ID #%d for this epsilon-transformation x' according to path ID #%d is %d" % (tree_id,
                                                                                                                                 path_id, x_prime_class))
            if x_prime_class != 1:
                logger.warning(
                    "The predicted class label by tree ID #%d for this epsilon-transformation x' according to path ID #%d should be 1" % (tree_id, path_id))
                continue

            logger.debug(
                "Check if this epsilon-transformation leads to an overall positive prediction...")
            y_hat_prime = model.predict(x_prime_row)[0]
            logger.debug(
                "The overall predicted class label by the ensemble for this epsilon-transformation x' is %d" % y_hat_prime)
            if y_hat_prime == 1:
                logger.info(
                    "This epsilon-transformation of x n. %d [ID#%d] leads to an overall positive prediction and therefore is a candidate transformation for x" % (n, i))
                candidate = (path_id, len(path), x_prime)
                x_transformations.setdefault(tree_id, []).append(candidate)

    return x_transformations
##########################################################################
# Compute epsilon-transformation of an instance according to a specific
# path ##
def compute_epsilon_transformation_path(x, epsilon, path, path_conditions):
    """
    This function computes the epsilon transformation of an instance x
    according to the boolean conditions encoded in the specified path

    Args:
        x (pandas.Series): vector representing the instance
                           x = (x_1, x_2, ..., x_n)
        epsilon (float): tolerance used to pass the tests encoded in path
        path (list(tuple)): encoding of a root-to-leaf path of a decision tree as
                            [(0, <dir>, theta_0), ..., (n-1, <dir>, theta_{n-1})]
                            where each (i, <dir>, theta_i) encode a boolean condition as follows
                            - if <dir> = "<=" then (i, "<=", theta_i) means that the (i+1)-th feature
                              must be less than or equal to theta_i (x_{i+1} <= theta_i)
                            - any other <dir> is handled as ">": (i, ">", theta_i) means that the
                              (i+1)-th feature must be greater than theta_i (x_{i+1} > theta_i)
                            (Note: the discrepancy of the indices derives from the fact that features
                            are 0-based indexed on the path, although usually they are referred using
                            1-based notation)
        path_conditions (dict): cache mapping each condition already examined to the feature
                                value stored for it; it is both read and updated by this function
    Returns:
        x_prime (pandas.Series): a transformation of the original instance x so that x_prime
                                 satisfies the conditions encoded in path with an epsilon tolerance
                                 For example, if x = (1.2, -3.7, 0.8) and path = [(0, <=, 1.5), (1, <=, -4)]
                                 x_prime = (1.2, -4-epsilon, 0.8)
                                 Indeed, the first boolean condition encoded in the path states that
                                 - (x_1 <= 1.5) Since x_1 = 1.2 this condition is already satisfied
                                 - (x_2 <= -4) Since x_2 = -3.7 this value must be changed accordingly
                                   so to satisfy the path, namely we set x_2 = -4-epsilon
                                 - Finally, since there is no condition for x_3, we let it as it is.
    """
    # Work on a copy, so that the original instance is left untouched
    x_prime = x.copy()

    logger.info("Loop through all the conditions encoded in the path")
    for i, cond in enumerate(path, 1):
        feature = cond[0]  # feature id (0-based position in x)
        direction = cond[1]  # condition direction (i.e. "<=" or ">")
        threshold = cond[2]  # condition threshold
        feature_name = x.index[feature]

        # 1. if we already examined this condition for this instance x then
        # we just retrieve the stored feature value for the transformed
        # instance x'
        logger.info("Check if path condition n. %d = [(%d, %s, %.5f)] has been already examined for this instance x..." % (
            i, feature, direction, threshold))
        if cond in path_conditions:
            logger.info("Path condition n. %d = [(%d, %s, %.5f)] has been already examined for this instance x! Let's assign %s = %.5f"
                        % (i, feature, direction, threshold, feature_name, path_conditions[cond]))
            # feature ids are positions: use .iloc rather than relying on the
            # positional fallback of [] on a label-indexed Series
            x_prime.iloc[feature] = path_conditions[cond]
            continue

        # 2. otherwise, we must compute the new feature value for the
        # transformed instance x'
        logger.info("Path condition n. %d = [(%d, %s, %.5f)] has not been yet examined: Let's compute it!" % (
            i, feature, direction, threshold))
        feature_value = x.iloc[feature]
        if direction == "<=":
            # Negative Direction Case: x_i must be less than or equal to
            # theta_i; if it is not, move it just below the threshold
            satisfied = feature_value <= threshold
            sign = "-"
            target = threshold - epsilon
        else:
            # Positive Direction Case: x_i must be greater than theta_i;
            # if it is not, move it just above the threshold
            satisfied = feature_value > threshold
            sign = "+"
            target = threshold + epsilon

        logger.debug("Direction is \"%s\"" % direction)
        logger.debug("Condition n. %d is about feature x_%d = %s: [(%s %s %.5f)]" % (
            i, feature + 1, feature_name, feature_name, direction, threshold))
        if satisfied:
            logger.debug("Condition n. %d is already verified by x as %s = %.5f" % (
                i, feature_name, feature_value))
        else:
            logger.debug("Condition n. %d is broken by x as %s = %.5f" % (
                i, feature_name, feature_value))
            logger.debug("Let %s = (%.5f %s epsilon) = (%.5f %s %.5f) = %.5f" %
                         (feature_name, threshold, sign, threshold, sign, epsilon, target))
            x_prime.iloc[feature] = target

        logger.info("Eventually, let's store feature %s = %.5f just computed according to path condition n. %d"
                    % (feature_name, x_prime.iloc[feature], i))
        # NOTE(review): the value stored is whatever x' holds for this feature
        # at this point, which may have been set by a previous condition of
        # this same path on the same feature. Reusing it for another path may
        # then change a feature that x already satisfies - confirm this is
        # the intended behaviour.
        path_conditions[cond] = x_prime.iloc[feature]

    return x_prime
##########################################################################
################# Store epsilon-transformations ##########################
def save_epsilon_transformations(X_transformations, outfilename, sep="\t", header=None):
    """
    Write the epsilon-transformations to a delimited text file.

    One line is written for each transformation: the instance key, the tree id,
    the path id and the path length (all formatted as integers) followed by
    the values of the transformed instance.

    Args:
        X_transformations (dict): for each instance key, a dictionary mapping each
                                  tree id to a list of (path_id, path_length, x_prime)
        outfilename (str): path to the output file (overwritten if it exists)
        sep (str): character used to delimit the output file (default = tab)
        header (list(str)): column names written as first line, if any (default = None)
    """
    # four integer columns followed by the already joined feature values
    row_template = sep.join(["%d", "%d", "%d", "%d", "%s\n"])
    with open(outfilename, 'w') as outfile:
        if header:
            outfile.write(sep.join(header) + "\n")
        for instance_key, per_tree in X_transformations.items():
            # trees are written in increasing order of their id
            for tree_id in sorted(per_tree):
                for candidate in per_tree[tree_id]:
                    path_id, path_length, x_prime = candidate[0], candidate[1], candidate[2]
                    feature_values = sep.join([str(value) for value in x_prime])
                    outfile.write(row_template % (
                        instance_key, tree_id, path_id, path_length, feature_values))
##########################################################################
############################## Main ######################################
def main(options):
    """
    Compute and save the epsilon-transformations of all the true negative instances.

    The dataset, the model and the positive paths are loaded; only the rows
    whose "class" column is -1 are kept and, among those, only the ones the
    model predicts as negative; their epsilon-transformations are computed by
    a pool of worker processes and written to
    <output_dirname>/transformations_<epsilon>.tsv

    Args:
        options (dict): the options as returned by get_options(); the keys
                        'dataset_filename', 'model_filename', 'paths_filename',
                        'output_dirname' and 'epsilon' are read
    """
    logger.info("Loading dataset from " + options['dataset_filename'])
    # Loading dataset
    dataset = loading_dataset(options['dataset_filename'])

    logger.info("Loading model from " + options['model_filename'])
    # Loading model
    model = loading_model(options['model_filename'])

    logger.info("Loading positive paths from " + options['paths_filename'])
    # Loading the positive paths of the model
    paths = loading_paths(options['paths_filename'])

    logger.info(
        "Selecting only (true) negative instances from the portion of dataset indexed")
    dataset = dataset[dataset["class"] == -1]

    logger.info(
        "Selecting features of only (true) negative instances from the portion of dataset indexed")
    # Features: every column but the last one
    X = dataset.iloc[:, :len(dataset.columns) - 1]

    logger.info(
        "Selecting class labels of only (true) negative instances from the portion of dataset indexed")
    # Class labels: the last column
    y = dataset.iloc[:, len(dataset.columns) - 1]

    logger.info(
        "Retrieving the list of instances whose class labels are really negatives and that are correctly predicted as negatives")
    true_negatives = get_true_negatives(X, y, model)
    # label-based selection (the .ix indexer has been removed from pandas)
    X_negatives = X.loc[true_negatives]

    logger.info("Creating the pool of workers")
    pool = mp.Pool()
    try:
        logger.info("Preparing the input to be sent to each worker of the pool")
        # n is the sequential number of the instance, i its index label
        inputs = [(X_negatives.loc[i], n, i, options['epsilon'], model, paths)
                  for (n, i) in enumerate(X_negatives.index)]

        logger.info("Compute all the possible epsilon-transformations in parallel for all the true negative instances of the dataset using workers of the pool")
        X_negatives_transformations = dict(
            pool.map(map_compute_epsilon_transformation, inputs))
    finally:
        # always release the worker processes, even if the computation failed
        pool.close()
        pool.join()

    logger.info("Creating the header for the output file")
    header = ['id', 'tree_id', 'path_id',
              'path_length'] + list(X.columns.values)

    # os.path.join (rather than a plain concatenation with '/') also copes
    # with an empty or slash-terminated directory name
    output_dirname = options['output_dirname']
    output_filename = os.path.join(
        output_dirname, 'transformations_' + str(options['epsilon']) + '.tsv')
    # make sure the output directory exists before writing into it
    if output_dirname and not os.path.isdir(output_dirname):
        os.makedirs(output_dirname)

    # Save epsilon-transformations to disk
    logger.info("Save the computed epsilon-transformations to %s" %
                output_filename)
    save_epsilon_transformations(
        X_negatives_transformations, output_filename, header=header)
# Script entry point: parse the command line, run main() and use its return
# value as the exit status of the process
if __name__ == "__main__":
    exit_status = main(get_options())
    sys.exit(exit_status)