-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreal_data_experiment.py
150 lines (106 loc) · 4.49 KB
/
real_data_experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 7 13:26:02 2022
@author: Hugo Bellamy
"""
import CSV_loader as csv
from joblib import Parallel, delayed
import pickle
import os
import numpy as np
import pandas as pd
import Active_noise_def as alf
import ranking_functions as rnk
from sklearn.ensemble import RandomForestRegressor
import retest_metrics as rtm
import results_analysis as resa
import multiprocessing as mp
import time
def prediction_variance(model, X):
    """
    Variance of the ensemble members' predictions for each row of X.

    Parameters
    ----------
    model : fitted ensemble exposing ``estimators_``
        every member must provide ``predict(X)``
    X : array-like
        inputs handed unchanged to each member's ``predict``

    Returns
    -------
    numpy.ndarray
        ``np.var`` taken across the members (axis 0), i.e. one value per
        prediction.
    """
    member_predictions = [estimator.predict(X) for estimator in model.estimators_]
    return np.var(member_predictions, axis=0)
def _run_and_save(X, y, methods, labels, noise, crit, batch_n,
                  model_seed, noise_seed, retest_metric, fname):
    """
    Run one active learner per acquisition method and pickle the summaries.

    Parameters
    ----------
    X, y : dataset features and targets, passed to ``Active_learner.load_data``
    methods, labels : parallel sequences of acquisition functions and their names
    noise : noise level handed to ``alf.Active_learner``
    crit : hit threshold handed to ``alf.Active_learner`` as ``crit_value``
    batch_n : number of batches passed to ``active_learn``
    model_seed : ``random_state`` of the random forest
    noise_seed : seed passed to ``load_data``
    retest_metric : retest policy (e.g. ``rtm.empty`` or ``rtm.simple``)
    fname : path of the pickle file to write

    Returns
    -------
    None.
    """
    learners = {}
    for label, method in zip(labels, methods):
        # a fresh forest per method so no fitted state is shared between runs
        base = RandomForestRegressor(100, n_jobs=-1, random_state=model_seed)
        learner = alf.Active_learner(base, method, 100, noise,
                                     crit_value=crit,
                                     retest_metric=retest_metric)
        # per the original author's note, load_data also sets the seed used
        # for the noise generation
        learner.load_data(X, y, noise_seed)
        learner.active_learn(batch_n, prediction_variance)
        learners[label] = learner

    hits_list = resa.int_cumulative_data(learners, labels)
    true_hits_list = resa.int_true_cumulative_data(learners, labels)
    # context manager guarantees the file is flushed and closed
    with open(fname, 'wb') as handle:
        pickle.dump([hits_list, true_hits_list], handle)


def test(path):
    """
    Perform the complete active learning experiment for a given dataset and
    save the results.

    For every noise level and every repeat, all acquisition methods are run
    twice: first without retests (``rtm.empty``), then with retests
    (``rtm.simple``). After each pass the outputs of
    ``resa.int_cumulative_data`` and ``resa.int_true_cumulative_data`` are
    pickled as a two-element list to
    ``results_1%/<dataset>/noR/`` and ``results_1%/<dataset>/withR/``
    respectively. Those directories must already exist.

    Parameters
    ----------
    path : string
        file name of the csv file to be tested, relative to ``qsar_data/``

    Returns
    -------
    None.
    """
    # Load dataset
    dataset = csv.data_obj('qsar_data/'+path)
    X, y = dataset.total_data()
    CHEMBL_name = path[:-4]  # drop the 4-character extension, e.g. '.csv'

    # Sort out parameters for active learning.
    # Hit threshold: the value at the 99th-percentile position of the sorted
    # targets, i.e. the top 1%. Use 0.9 here for the top 10%; note the output
    # directory name 'results_1%' is hard-coded separately.
    # np.sort works on a copy so y is left untouched.
    # NOTE(review): assumes y is a NumPy array - confirm against CSV_loader.
    sorted_y = np.sort(np.asarray(y))
    crit = sorted_y[int(len(sorted_y)*0.99)]

    # how many batches to perform
    batch_n = int(len(y)/200)+1

    # range of values for noise: fractions of the spread of y
    noise_range = (np.amax(y)-np.amin(y))
    noise_factors = np.linspace(0, 0.25, 6)

    # acquisition metrics to try
    methods = [rnk.greedy, rnk.random, rnk.UCB, rnk.EI, rnk.PI]
    labels = ['greedy', 'random', 'UCB', 'EI', 'PI']

    seeds = [658, 682, 533, 27, 889, 224, 205, 338, 559, 163]
    repeats = 10

    # first run with no retests, then repeat with retests
    passes = (('noR', rtm.empty), ('withR', rtm.simple))

    for noise_factor in noise_factors:
        noise = noise_factor*noise_range
        for repeat in range(repeats):
            suffix = '/AL_noise'+str(noise_factor)+'R'+str(repeat)+'.pkl'
            for subdir, retest_metric in passes:
                _run_and_save(
                    X, y, methods, labels, noise, crit, batch_n,
                    # the model seed walks the seed list backwards while the
                    # noise seed walks it forwards
                    model_seed=seeds[repeats-repeat-1],
                    noise_seed=seeds[repeat],
                    retest_metric=retest_metric,
                    fname='results_1%/'+CHEMBL_name+'/'+subdir+suffix,
                )
def exp(name):
    """
    Run the experiment for one csv file in ``qsar_data/`` if it is large
    enough.

    A dataset with more than 800 rows (as read by ``pandas.read_csv``) gets
    its results directories created under ``results_1%/`` and is passed to
    ``test``; its csv file is deleted once ``test`` returns. Smaller
    datasets are skipped.

    Parameters
    ----------
    name : string
        file name of the csv file inside ``qsar_data/``

    Returns
    -------
    None.
    """
    length = len(pd.read_csv('qsar_data/'+name))
    if length > 800:
        CHEMBL_name = name[:-4]  # drop the 4-character extension, e.g. '.csv'
        print('New Dataset '+CHEMBL_name)
        # makedirs with exist_ok=True also creates 'results_1%' itself and
        # does not raise when the directories are left over from an
        # interrupted earlier run
        for subdir in ('noR', 'withR'):
            os.makedirs('results_1%/'+CHEMBL_name+'/'+subdir, exist_ok=True)
        test(name)
        # NOTE(review): the input file is deleted only after it has been
        # processed; confirm that skipped (small) datasets are meant to be
        # left in place rather than removed as well.
        os.remove('qsar_data/'+name)
def main():
    """
    Dispatch ``exp`` for every entry of the ``qsar_data`` directory through
    joblib, with ``n_jobs`` set to the value reported by ``mp.cpu_count()``.
    """
    data_files = os.listdir('qsar_data')
    worker_count = mp.cpu_count()
    jobs = (delayed(exp)(file_name) for file_name in data_files)
    Parallel(n_jobs=worker_count)(jobs)
# Start the experiments only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()