"""
Base classes serving as design documentation.
"""
import numpy as np
class DatasetNotDownloadable(Exception):
pass
class DatasetNotPresent(Exception):
pass


class Task(object):
    """
    A Task is the smallest unit of data packaging for training a machine
    learning model. For different machine learning applications (semantics)
    the attributes are different, but there are some conventions.

    For example:

    semantics='vector_classification'
        - self.x is a matrix-like feature matrix with a row for each example
          and a column for each feature.
        - self.y is an array of labels (any type, but often integer or string)

    semantics='image_classification'
        - self.x is a 4D structure (images x height x width x channels)
        - self.y is an array of labels (any type, but often integer or string)

    semantics='indexed_vector_classification'
        - self.all_vectors is a matrix (examples x features)
        - self.all_labels is a vector of labels
        - self.idxs is a vector of relevant example positions

    semantics='indexed_image_classification'
        - self.all_images is a 4D structure (images x height x width x channels)
        - self.all_labels is a vector of labels
        - self.idxs is a vector of relevant example positions

    The design taken in skdata is that each data set view file defines

    * a semantics object (a string in the examples above) that uniquely
      *identifies* what a learning algorithm is supposed to do with the Task,
      and

    * documentation to *describe* to the user what a learning algorithm is
      supposed to do with the Task.

    As library designers, it is our hope that data set authors can re-use each
    others' semantics as much as possible, so that learning algorithms are
    more portable between tasks.
    """
    def __init__(self, semantics=None, name=None, **kwargs):
        self.semantics = semantics
        self.name = name
        self.__dict__.update(kwargs)
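

# A minimal sketch (not part of the original skdata API) of how a Task with
# the 'vector_classification' semantics described above might be built; the
# data here are random and purely illustrative.
def _example_vector_classification_task():
    x = np.random.randn(20, 5)   # 20 examples, 5 features
    y = np.arange(20) % 2        # binary labels
    return Task(semantics='vector_classification', name='toy_example',
                x=x, y=y)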


class Split(object):
    """
    A Split is a (train, test) pair of Tasks with no common examples.

    This class is used in cross-validation to select / learn parameters
    based on the `train` task, and then to evaluate them on the `test` task.
    """
    # XXX This class is no longer necessary in the View API
    def __init__(self, train, test):
        self.train = train
        self.test = test


class View(object):
    """
    A View is an interpretation of a data set as a standard learning problem.
    """
    def __init__(self, dataset=None):
        """
        dataset: a reference to a low-level object that offers access to the
            raw data. It is not standardized in any way, and the reference
            itself is optional.
        """
        self.dataset = dataset

    def protocol(self, algo):
        """
        Return a list of instructions for a learning algorithm.

        An instruction is a 3-tuple of (attr, args, kwargs) such that
        algo.<attr>(*args, **kwargs) can be interpreted by the learning algo
        as a sensible operation, such as training a model from some data or
        testing a previously trained model.

        See `LearningAlgo` below for a list of standard instructions that a
        learning algorithm implementation should support, but the protocol is
        left open deliberately so that new View objects can call any method
        necessary on a LearningAlgo, even if it means calling a relatively
        unique method that only particular LearningAlgo implementations
        support.
        """
        raise NotImplementedError()
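

# A minimal sketch (not part of the original module) of how a protocol driver
# might interpret the (attr, args, kwargs) instructions described above; the
# `view` and `algo` arguments are assumed to be a View and a LearningAlgo.
def _example_run_protocol(view, algo):
    for attr, args, kwargs in view.protocol(algo):
        getattr(algo, attr)(*args, **kwargs)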


class LearningAlgo(object):
    """
    A base class for learning algorithms that can be driven by the protocol()
    functions that are sometimes included in View subclasses.

    The idea is that a protocol driver will call these methods in a particular
    order with appropriate tasks, splits, etc., and an instance of a subclass
    will thereby perform an experiment by side effect on `self`.
    """

    def task(self, *args, **kwargs):
        # XXX This is a typo right? Surely there is no reason for a
        # LearningAlgo to have a self.task method...
        return Task(*args, **kwargs)

    def best_model(self, train, valid=None, return_promising=False):
        """
        Train a model from task `train`, optionally optimizing for
        cross-validated performance on `valid`.

        If `return_promising` is False, this function returns a tuple:

            (model, train_error, valid_error)

        in which
            model is an opaque model for the task,
            train_error is a scalar loss criterion on the training task, and
            valid_error is a scalar loss criterion on the validation task.

        If `return_promising` is True, this function returns

            (model, train_error, valid_error, promising)

        The `promising` term is a boolean flag indicating whether the model
        seemed to work (1) or appeared to be degenerate (0).
        """
        raise NotImplementedError('implement me')

    def loss(self, model, task):
        """
        Return the scalar-valued training criterion of `model` on `task`.

        This function can modify `self` but it should not semantically modify
        `model` or `task`.
        """
        raise NotImplementedError('implement me')

    # -- as an example of weird methods an algo might be required to implement
    #    to accommodate bizarre protocols, see this one, which is required by
    #    LFW. Generally there is no need for this base class to list such
    #    special-case functions.
    def retrain_classifier(self, model, train, valid=None):
        """
        To the extent that `model` includes a feature extractor that is
        distinct from a classifier, re-train the classifier only. This
        unusual step is required in the original View1 / View2 LFW protocol.
        It is included here as encouragement to add dataset-specific steps
        in LearningAlgo subclasses.
        """
        raise NotImplementedError('implement me')

    def forget_task(self, task_name):
        """
        Signal that it is OK to delete any features / statistics etc. related
        specifically to task `task_name`. This can safely be ignored for
        small data sets, but deleting such intermediate results can be
        crucial to keeping memory use under control.
        """
        pass


class SemanticsDelegator(LearningAlgo):
    def best_model(self, train, valid=None):
        if valid:
            assert train.semantics == valid.semantics
        return getattr(self, 'best_model_' + train.semantics)(train, valid)

    def loss(self, model, task):
        return getattr(self, 'loss_' + task.semantics)(model, task)
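

# A minimal sketch (not part of the original module) of how the delegation
# above is used: a subclass defines only semantics-specific methods, and
# SemanticsDelegator routes best_model / loss calls to them by task.semantics.
class _ExampleMajorityLabelAlgo(SemanticsDelegator):
    def best_model_vector_classification(self, train, valid):
        # Trivial "model": just remember the most frequent training label.
        values, counts = np.unique(train.y, return_counts=True)
        return values[np.argmax(counts)]

    def loss_vector_classification(self, model, task):
        # Error rate of always predicting the remembered label.
        return np.mean(task.y != model)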


class SklearnClassifier(SemanticsDelegator):
    """
    Implement a LearningAlgo as much as possible in terms of an sklearn
    classifier.

    This class is meant to illustrate how to create an adapter between an
    existing implementation of a machine learning algorithm, and the various
    data sets defined in the skdata library.

    Researchers are encouraged to implement their own Adapter classes
    following the example of this class (i.e. cut & paste this class)
    to measure the statistics they care about when handling the various
    methods (e.g. best_model_vector_classification) and to save those
    statistics to a convenient place. The practice of appending a summary
    dictionary to the lists in self.results has proved to be useful for me,
    but there is no reason it should, in general, be the right thing for
    others.

    This class is also used for internal unit testing of Protocol interfaces,
    so it should be free of bit rot.
    """
    def __init__(self, new_model):
        self.new_model = new_model
        self.results = {
            'best_model': [],
            'loss': [],
        }

    def best_model_vector_classification(self, train, valid):
        # TODO: use validation set if not None
        model = self.new_model()
        print('SklearnClassifier training on data set of shape %s'
              % (train.x.shape,))
        model.fit(train.x, train.y)
        model.trained_on = train.name
        self.results['best_model'].append(
            {
                'train_name': train.name,
                'valid_name': valid.name if valid else None,
                'model': model,
            })
        return model

    def loss_vector_classification(self, model, task):
        p = model.predict(task.x)
        err_rate = np.mean(p != task.y)
        self.results['loss'].append(
            {
                'model_trained_on': model.trained_on,
                'predictions': p,
                'err_rate': err_rate,
                'n': len(p),
                'task_name': task.name,
            })
        return err_rate

    @staticmethod
    def _fallback_indexed_vector(task):
        return Task(
            name=task.name,
            semantics="vector_classification",
            x=task.all_vectors[task.idxs],
            y=task.all_labels[task.idxs])

    def best_model_indexed_vector_classification(self, train, valid):
        return self.best_model_vector_classification(
            self._fallback_indexed_vector(train),
            self._fallback_indexed_vector(valid))

    def loss_indexed_vector_classification(self, model, task):
        return self.loss_vector_classification(
            model, self._fallback_indexed_vector(task))

    @staticmethod
    def _fallback_indexed_image_task(task):
        if task is None:
            return None
        x = task.all_images[task.idxs]
        y = task.all_labels[task.idxs]
        if 'int' in str(x.dtype):
            x = x.astype('float32') / 255
        else:
            x = x.astype('float32')
        x2d = x.reshape(len(x), -1)
        rval = Task(
            name=task.name,
            semantics="vector_classification",
            x=x2d,
            y=y)
        return rval

    def best_model_indexed_image_classification(self, train, valid):
        return self.best_model_vector_classification(
            self._fallback_indexed_image_task(train),
            self._fallback_indexed_image_task(valid))

    def loss_indexed_image_classification(self, model, task):
        return self.loss_vector_classification(
            model, self._fallback_indexed_image_task(task))
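

# A minimal usage sketch (not part of the original module), assuming
# scikit-learn is installed. The adapter is given a factory for new models
# and then driven with hand-built Tasks; in a real experiment a View's
# protocol() would issue these calls instead.
def _example_sklearn_usage():
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    train = Task(semantics='vector_classification', name='toy_train',
                 x=rng.randn(100, 3), y=rng.randint(0, 2, size=100))
    test = Task(semantics='vector_classification', name='toy_test',
                x=rng.randn(50, 3), y=rng.randint(0, 2, size=50))

    algo = SklearnClassifier(LogisticRegression)
    model = algo.best_model(train)
    err_rate = algo.loss(model, test)
    return err_rate, algo.results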