-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathexperiment.py
109 lines (91 loc) · 4.39 KB
/
experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""Run experiments"""
from train import train_and_evaluate
from preprocess import preprocess
import numpy as np
import utils
import json
import argparse
import config
import datetime
from init import init
from clean import clean
import time
import logging
def one_search_experiment(dataset, error_type, train_file, model, seed, n_jobs=1, hyperparams=None):
    """Run a single experiment for one (dataset, error type, train file, model, seed) combination.

    Args:
        dataset (dict): dataset dict in config.py
        error_type (string): error type
        train_file (string): filename of training set (dirty or clean)
        model (dict): ml model dict in model.py
        seed (int): seed for this experiment
        n_jobs (int): number of parallel jobs for training
        hyperparams: optional hyperparameters forwarded to train_and_evaluate
            (presumably fixed params that skip the search — confirm in train.py)

    Returns:
        the result dict produced by train_and_evaluate
    """
    # Derive two independent seeds (down-sampling and training) from the experiment seed.
    np.random.seed(seed)
    down_sample_seed, train_seed = np.random.randint(1000, size=2)

    # Load and normalize the data for this error type / training file.
    X_train, y_train, X_test_list, y_test_list, test_files = preprocess(
        dataset, error_type, train_file,
        normalize=True, down_sample_seed=down_sample_seed)

    # Train the model and evaluate it on every test set.
    return train_and_evaluate(
        X_train, y_train, X_test_list, y_test_list, test_files,
        model, n_jobs=n_jobs, seed=train_seed, hyperparams=hyperparams)
def one_split_experiment(dataset, n_retrain=5, seed=1, n_jobs=1, nosave=True, error_type=None):
    """Run experiments on one dataset for one split.

    Iterates over every (error type, train file, model, run seed) combination
    for this dataset, skipping combinations whose results already exist.

    Args:
        dataset (dict): dataset dict in config.py
        n_retrain (int): times of repeated experiments
        seed (int): experiment seed used to derive the per-run seeds
        n_jobs (int): number of parallel jobs for model training
        nosave (bool): if True, do not save results
        error_type (string): if given, only run experiments for this error type
    """
    # Generate seeds for the n_retrain repeated experiments.
    np.random.seed(seed)
    # Renamed from `seed` / loop-var `seed`: the original loop variable shadowed
    # and clobbered the function parameter.
    run_seeds = np.random.randint(10000, size=n_retrain)

    # Load the result dict so previously completed runs can be skipped.
    result = utils.load_result(dataset['data_dir'])

    # Run experiments.
    for error in dataset["error_types"]:
        if error_type is not None and error != error_type:
            continue
        for train_file in utils.get_train_files(error):
            for model in config.models:
                for run_seed in run_seeds:
                    version = utils.get_version(utils.get_dir(dataset, error, train_file))
                    key = "/".join((dataset['data_dir'], 'v'+str(version), error, train_file, model['name'], str(run_seed)))
                    # Membership test on the dict directly (idiomatic, same behavior as .keys()).
                    if key in result:
                        print("Ignore experiment {} that has been completed before.".format(key))
                        continue
                    print("{} Processing {}".format(datetime.datetime.now(), key))
                    res = one_search_experiment(dataset, error, train_file, model, run_seed, n_jobs=n_jobs)
                    if not nosave:
                        utils.save_result(dataset['data_dir'], key, res)
def experiment(datasets, log=False, n_jobs=1, nosave=False, error_type=None, arg_seeds=None):
    """Run experiments on all datasets for all splits.

    Args:
        datasets (list): list of dataset dicts from config.py
        log (bool): whether to write debug output to a timestamped log file
        n_jobs (int): number of parallel jobs for model training
        nosave (bool): if True, do not save results
        error_type (string): if given, restrict runs to this error type
        arg_seeds (list): if given, only run splits whose index is in this list
    """
    # Set up the logger for experiments.
    if log:
        # NOTE(review): captureWarnings(False) is already the default (warnings are
        # NOT routed into logging) — confirm that True wasn't the intended value.
        logging.captureWarnings(False)
        logging.basicConfig(filename='logging_{}.log'.format(datetime.datetime.now()), level=logging.DEBUG)

    # Derive deterministic seeds: one per re-split, plus one shared experiment seed.
    np.random.seed(config.root_seed)
    split_seeds = np.random.randint(10000, size=config.n_resplit)
    experiment_seed = np.random.randint(10000)

    # Run experiments.
    for dataset in datasets:
        if log:
            logging.debug("{}: Experiment on {}".format(datetime.datetime.now(), dataset['data_dir']))
        for i, split_seed in enumerate(split_seeds):
            # Optionally restrict to the requested split indices.
            if arg_seeds is not None and i not in arg_seeds:
                continue
            if utils.check_completed(dataset, split_seed, experiment_seed):
                print("Ignore {}-th experiment on {} that has been completed before.".format(i, dataset['data_dir']))
                continue
            start = time.time()
            init(dataset, seed=split_seed, max_size=config.max_size)
            clean(dataset, error_type)
            one_split_experiment(dataset, n_retrain=config.n_retrain, n_jobs=n_jobs,
                                 nosave=nosave, seed=experiment_seed, error_type=error_type)
            # Report elapsed time and a naive linear estimate of what remains.
            elapsed_min = (time.time() - start) / 60
            remaining = elapsed_min * (len(split_seeds) - i - 1)
            if log:
                logging.debug("{}: {}-th experiment takes {} min. Estimated remaining time: {} min".format(
                    datetime.datetime.now(), i, elapsed_min, remaining))