From 1711cbeb430a51e9bf5ae7b206416b7354006117 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Tue, 17 Sep 2019 17:45:36 +0200
Subject: [PATCH 01/21] WIP: pipeline

---
 takepod/pipeline/__init__.py |  0
 takepod/pipeline/pipeline.py | 27 +++++++++++++++++++++++++++
 takepod/storage/field.py     |  2 +-
 3 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 takepod/pipeline/__init__.py
 create mode 100644 takepod/pipeline/pipeline.py

diff --git a/takepod/pipeline/__init__.py b/takepod/pipeline/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/takepod/pipeline/pipeline.py b/takepod/pipeline/pipeline.py
new file mode 100644
index 00000000..83a0b052
--- /dev/null
+++ b/takepod/pipeline/pipeline.py
@@ -0,0 +1,27 @@
+from takepod.datasets import Dataset, SingleBatchIterator
+from takepod.models import AbstractSupervisedModel
+
+
+class Pipeline:
+
+    def __init__(self,
+                 fields,
+                 create_example_fn,
+                 feature_transform_fn,
+                 model,
+                 predict_kwargs
+                 ):
+        self.model = model
+        self.fields = fields
+        self.create_example_fn = create_example_fn
+        self.feature_transform_fn = feature_transform_fn
+        self.predict_kwargs = predict_kwargs
+
+    def predict(self, example):
+        processed_example = self.create_example_fn(example)
+        ds = Dataset([processed_example], self.fields)
+
+        x_batch, _ = next(SingleBatchIterator(ds).__iter__())
+        x = self.feature_transform_fn(x_batch)
+        prediction_dict = self.model.predict(x, **self.predict_kwargs)
+        return prediction_dict[AbstractSupervisedModel.PREDICTION_KEY]
diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index 786f8c85..3032890b 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -518,7 +518,7 @@ def _process_tokens(self, data, tokens):
 
         data, tokens = self._run_posttokenization_hooks(data, tokens)
 
-        if self.eager and self.use_vocab:
+        if self.eager and self.use_vocab and not self.vocab.finalized:
             self.update_vocab(data, tokens)
 
         data = data if self.store_as_raw else None

From fff3eb2f519956eead22d82a8660c923c506fe08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Fri, 20 Sep 2019 13:30:13 +0200
Subject: [PATCH 02/21] Added ExampleFormat enum to ExampleFactory

---
 takepod/pipeline/pipeline.py         | 30 ++++++++++++++++++++++++----
 takepod/storage/__init__.py          |  4 ++--
 takepod/storage/example_factory.py   | 17 +++++++++++++++-
 test/storage/test_example_factory.py | 24 +++++++++++++++++++++-
 4 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/takepod/pipeline/pipeline.py b/takepod/pipeline/pipeline.py
index 83a0b052..be775463 100644
--- a/takepod/pipeline/pipeline.py
+++ b/takepod/pipeline/pipeline.py
@@ -1,24 +1,46 @@
+from enum import Enum, auto
+import logging
+
+from takepod.storage import ExampleFactory, ExampleFormat
 from takepod.datasets import Dataset, SingleBatchIterator
 from takepod.models import AbstractSupervisedModel
 
+_LOGGER = logging.getLogger(__name__)
+
 
 class Pipeline:
 
     def __init__(self,
                  fields,
-                 create_example_fn,
+                 example_format: ExampleFormat,
                  feature_transform_fn,
                  model,
                  predict_kwargs
                  ):
+        if example_format in (ExampleFormat.LIST, ExampleFormat.CSV, ExampleFormat.NLTK):
+            if not isinstance(fields, (list, tuple)):
+                error_msg = "If example format is LIST, CSV or NLTK, `fields`" \
+                            "must be either a list or tuple. " \
+                            "Type of `fields`: {}".format(type(fields))
+                _LOGGER.error(error_msg)
+                raise TypeError(error_msg)
+        elif not isinstance(fields, dict):
+            error_msg = "If example format is DICT, XML or JSON, `fields`" \
+                        "must be a dict. " \
+                        "Type of `fields`: {}".format(type(fields))
+            _LOGGER.error(error_msg)
+            raise TypeError(error_msg)
+
         self.model = model
         self.fields = fields
-        self.create_example_fn = create_example_fn
+        self.example_format = example_format
         self.feature_transform_fn = feature_transform_fn
         self.predict_kwargs = predict_kwargs
+        self.example_factory = ExampleFactory(fields)
 
-    def predict(self, example):
-        processed_example = self.create_example_fn(example)
+    def predict(self, raw_example):
+        processed_example = self.example_factory.from_format(raw_example,
+                                                             self.example_format)
         ds = Dataset([processed_example], self.fields)
 
         x_batch, _ = next(SingleBatchIterator(ds).__iter__())
diff --git a/takepod/storage/__init__.py b/takepod/storage/__init__.py
index dd98daf4..1ce432f5 100644
--- a/takepod/storage/__init__.py
+++ b/takepod/storage/__init__.py
@@ -1,6 +1,6 @@
 """Package contains modules for storing and loading datasets and vectors."""
 
-from .example_factory import ExampleFactory
+from .example_factory import ExampleFactory, ExampleFormat
 from .field import Field, TokenizedField, MultilabelField, MultioutputField, unpack_fields
 from .resources.downloader import (BaseDownloader, SCPDownloader, HttpDownloader,
                                    SimpleHttpDownloader)
@@ -22,4 +22,4 @@
            "Field", "TokenizedField", "MultilabelField", "MultioutputField",
            "unpack_fields", "LargeResource", "SCPLargeResource",
            "VectorStorage", "BasicVectorStorage", "SpecialVocabSymbols", "Vocab",
-           "ExampleFactory", "TfIdfVectorizer"]
+           "ExampleFactory", "ExampleFormat", "TfIdfVectorizer"]
diff --git a/takepod/storage/example_factory.py b/takepod/storage/example_factory.py
index f5766712..148e3b2a 100644
--- a/takepod/storage/example_factory.py
+++ b/takepod/storage/example_factory.py
@@ -4,6 +4,7 @@
 import logging
 import json
 import csv
+from enum import Enum, auto
 import xml.etree.ElementTree as ET
 
 from takepod.storage.field import unpack_fields
@@ -11,6 +12,15 @@
 _LOGGER = logging.getLogger(__name__)
 
 
+class ExampleFormat(Enum):
+    LIST = lambda data, factory: factory.from_list(data)
+    DICT = lambda data, factory: factory.from_dict(data)
+    CSV = lambda data, factory: factory.from_csv(data)
+    NLTK = lambda data, factory: factory.from_fields_tree(data)
+    XML = lambda data, factory: factory.from_xml_str(data)
+    JSON = lambda data, factory: factory.from_json(data)
+
+
 class Example:
     """Method models one example with fields that hold
        (raw, tokenized) values and special fields with "_"
@@ -152,7 +162,7 @@ def from_xml_str(self, data):
                 node = root
             else:
                 error_msg = "Specified name {} was not found in the " \
-                    "input data".format(name)
+                            "input data".format(name)
                 _LOGGER.error(error_msg)
                 raise ValueError(error_msg)
 
@@ -251,6 +261,11 @@ def from_fields_tree(self, data, subtrees=False):
         else:
             return self.from_list(tree_to_list(tree))
 
+    def from_format(self,
+                    data,
+                    format: ExampleFormat):
+        return format(data, self)
+
 
 def tree_to_list(tree):
     """Method joins tree leaves and label in one list.
diff --git a/test/storage/test_example_factory.py b/test/storage/test_example_factory.py
index 367a1401..b23fb3ac 100644
--- a/test/storage/test_example_factory.py
+++ b/test/storage/test_example_factory.py
@@ -1,6 +1,6 @@
 import pytest
 
-from takepod.storage import ExampleFactory, Field
+from takepod.storage import ExampleFactory, Field, ExampleFormat
 
 name_field = Field("Name",
                    store_as_raw=True,
@@ -397,3 +397,25 @@ def test_cache_data_field_from_dict(expected_values):
 
         assert hasattr(example, field_name)
         assert hasattr(example, "{}_".format(field_name))
+
+
+def test_from_format():
+    list_example_factory = ExampleFactory(field_list)
+
+    list_data = ["Mark Dark", 5, "Hawaiian pizza"]
+    example = list_example_factory.from_format(list_data, ExampleFormat.LIST)
+
+    assert example.Name[0] == list_data[0]
+    assert example.Score[0] == list_data[1]
+    assert example.Favorite_food[0] == list_data[2]
+
+    dict_example_factory = ExampleFactory(field_dict)
+    dict_data = {"Name": "Mark Dark",
+                 "Score": 5,
+                 "Favorite_food": "Hawaiian pizza"}
+
+    example = dict_example_factory.from_format(dict_data, ExampleFormat.DICT)
+
+    assert example.Name[0] == dict_data["Name"]
+    assert example.Score[0] == dict_data["Score"]
+    assert example.Favorite_food[0] == dict_data["Favorite_food"]
+    # TODO extend testing to other formats?

From 51604aaf12a2850aedb72213439d45a9308426b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Fri, 20 Sep 2019 15:26:20 +0200
Subject: [PATCH 03/21] Implemented FeatureTransformer

---
 takepod/models/__init__.py     |  3 ++-
 takepod/models/transformers.py | 40 ++++++++++++++++++++++++++++++++++
 takepod/pipeline/pipeline.py   |  8 +++----
 3 files changed, 46 insertions(+), 5 deletions(-)
 create mode 100644 takepod/models/transformers.py

diff --git a/takepod/models/__init__.py b/takepod/models/__init__.py
index e367f31f..4d7f71cf 100644
--- a/takepod/models/__init__.py
+++ b/takepod/models/__init__.py
@@ -4,7 +4,8 @@
 from .batch_transform_functions import default_feature_transform, default_label_transform
 from .experiment import Experiment
 from .trainer import AbstractTrainer
+from .transformers import FeatureTransformer
 
 __all__ = ["AbstractFrameworkModel", "AbstractSupervisedModel",
            "default_feature_transform", "default_label_transform", "Experiment",
-           "AbstractTrainer"]
+           "AbstractTrainer", "FeatureTransformer"]
diff --git a/takepod/models/transformers.py b/takepod/models/transformers.py
new file mode 100644
index 00000000..bdd5a850
--- /dev/null
+++ b/takepod/models/transformers.py
@@ -0,0 +1,40 @@
+from abc import ABC, abstractmethod
+from typing import Callable, NamedTuple
+
+import numpy as np
+
+
+class TensorTransformer(ABC):
+
+    @abstractmethod
+    def fit(self,
+            x: np.ndarray,
+            y: np.ndarray):
+        pass
+
+    @abstractmethod
+    def transform(self,
+                  x: np.array
+                  ) -> np.ndarray:
+        pass
+
+
+class FeatureTransformer:
+
+    def __init__(self,
+                 feature_extraction_fn: Callable[[NamedTuple], np.ndarray],
+                 tensor_transformer: TensorTransformer):
+        self.feature_extraction_fn = feature_extraction_fn
+        self.tensor_transform = tensor_transformer
+
+    def fit(self,
+            x: NamedTuple,
+            y: np.ndarray):
+        x_tensor = self.feature_extraction_fn(x)
+        self.tensor_transform.fit(x_tensor, y)
+
+    def transform(self,
+                  x: NamedTuple) -> np.ndarray:
+        x_tensor = self.feature_extraction_fn(x)
+        self.tensor_transform.transform(x_tensor)
+
diff --git a/takepod/pipeline/pipeline.py b/takepod/pipeline/pipeline.py
index be775463..478b1274 100644
--- a/takepod/pipeline/pipeline.py
+++ b/takepod/pipeline/pipeline.py
@@ -3,7 +3,7 @@
 
 from takepod.storage import ExampleFactory, ExampleFormat
 from takepod.datasets import Dataset, SingleBatchIterator
-from takepod.models import AbstractSupervisedModel
+from takepod.models import AbstractSupervisedModel, FeatureTransformer
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -13,7 +13,7 @@ class Pipeline:
     def __init__(self,
                  fields,
                  example_format: ExampleFormat,
-                 feature_transform_fn,
+                 feature_transformer: FeatureTransformer,
                  model,
                  predict_kwargs
                  ):
@@ -34,7 +34,7 @@ def __init__(self,
         self.model = model
         self.fields = fields
         self.example_format = example_format
-        self.feature_transform_fn = feature_transform_fn
+        self.feature_transformer = feature_transformer
         self.predict_kwargs = predict_kwargs
         self.example_factory = ExampleFactory(fields)
 
@@ -44,6 +44,6 @@ def predict(self, raw_example):
         ds = Dataset([processed_example], self.fields)
 
         x_batch, _ = next(SingleBatchIterator(ds).__iter__())
-        x = self.feature_transform_fn(x_batch)
+        x = self.feature_transformer.transform(x_batch)
         prediction_dict = self.model.predict(x, **self.predict_kwargs)
         return prediction_dict[AbstractSupervisedModel.PREDICTION_KEY]

From 849386d630cab14fdb7f8e4418af465a4e180b98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Mon, 23 Sep 2019 17:25:45 +0200
Subject: [PATCH 04/21] Made podium use FeatureTransformer

---
 takepod/models/__init__.py             |  2 +-
 takepod/models/experiment.py           | 25 ++++---
 takepod/models/impl/simple_trainers.py |  4 +-
 takepod/models/trainer.py              |  7 +-
 takepod/models/transformers.py         | 32 +++++++--
 test/models/test_experiment.py         | 92 ++++++++++++++++--------
 test/models/test_simple_trainers.py    | 11 +--
 7 files changed, 123 insertions(+), 50 deletions(-)

diff --git a/takepod/models/__init__.py b/takepod/models/__init__.py
index 4d7f71cf..71f5ba5d 100644
--- a/takepod/models/__init__.py
+++ b/takepod/models/__init__.py
@@ -2,9 +2,9 @@
 
 from .model import AbstractFrameworkModel, AbstractSupervisedModel
 from .batch_transform_functions import default_feature_transform, default_label_transform
+from .transformers import FeatureTransformer
 from .experiment import Experiment
 from .trainer import AbstractTrainer
-from .transformers import FeatureTransformer
 
 __all__ = ["AbstractFrameworkModel", "AbstractSupervisedModel",
            "default_feature_transform", "default_label_transform", "Experiment",
diff --git a/takepod/models/experiment.py b/takepod/models/experiment.py
index 560e5983..662030a2 100644
--- a/takepod/models/experiment.py
+++ b/takepod/models/experiment.py
@@ -6,8 +6,8 @@
 
 from takepod.datasets.dataset import Dataset
 from takepod.datasets.iterator import Iterator, SingleBatchIterator
-from takepod.models import AbstractSupervisedModel,\
-    default_feature_transform, default_label_transform
+from takepod.models import AbstractSupervisedModel, \
+    default_feature_transform, default_label_transform, FeatureTransformer
 from takepod.models.trainer import AbstractTrainer
 
 
@@ -19,8 +19,7 @@ def __init__(self,
                  trainer: AbstractTrainer,
                  training_iterator_callable: Callable[[Dataset], Iterator],
                  prediction_iterator_callable: Callable[[Dataset], Iterator] = None,
-                 feature_transform_fun:
-                     Callable[[NamedTuple], np.ndarray] = None,
+                 feature_transformer: FeatureTransformer = None,
                  label_transform_fun:
                      Callable[[NamedTuple], np.ndarray] = None
                  ):
@@ -56,6 +55,7 @@ def __init__(self,
             the prediction result of the model for some examples must be identical to
             the result of this callable for those same examples.
""" + # TODO update docs to account for FeatureTransformer self.model_class = model_class self.model = None self.trainer = trainer @@ -72,9 +72,9 @@ def default_prediction_iterator_callable(dataset): else: self.prediction_iterator_callable = prediction_iterator_callable - self.feature_transform_fun = feature_transform_fun \ - if feature_transform_fun is not None \ - else default_feature_transform + self.feature_transformer = feature_transformer \ + if feature_transformer is not None \ + else FeatureTransformer(default_feature_transform) self.label_transform_fun = label_transform_fun \ if label_transform_fun is not None \ @@ -136,13 +136,20 @@ def fit(self, trainer_args = self.default_trainer_args.copy() trainer_args.update(trainer_kwargs) + # Fit the feature transformer if it needs2 fitting + if self.feature_transformer.requires_fitting(): + x_batch, y_batch = next(SingleBatchIterator(dataset).__iter__()) + y = self.label_transform_fun(y_batch) + self.feature_transformer.fit(x_batch, y) + # Create new model instance self.model = self.model_class(**model_args) train_iterator = self.training_iterator_callable(dataset) + # Train the model self.trainer.train(self.model, train_iterator, - self.feature_transform_fun, + self.feature_transformer, self.label_transform_fun, **trainer_args) @@ -171,7 +178,7 @@ def predict(self, y = [] for x_batch, _ in self.prediction_iterator_callable(dataset): - x_batch_tensor = self.feature_transform_fun(x_batch) + x_batch_tensor = self.feature_transformer.transform(x_batch) batch_prediction = self.model.predict(x_batch_tensor, **kwargs) prediction_tensor = batch_prediction[AbstractSupervisedModel.PREDICTION_KEY] y.append(prediction_tensor) diff --git a/takepod/models/impl/simple_trainers.py b/takepod/models/impl/simple_trainers.py index 2de8617d..947e3d4d 100644 --- a/takepod/models/impl/simple_trainers.py +++ b/takepod/models/impl/simple_trainers.py @@ -16,13 +16,13 @@ class SimpleTrainer(AbstractTrainer): def train(self, model, iterator, - feature_transform_fun, + feature_transformer, label_transform_fun, **kwargs): self._check_kwargs(**kwargs) for _ in range(kwargs[SimpleTrainer.MAX_EPOCH_KEY]): for x_batch, y_batch in iterator: - x = feature_transform_fun(x_batch) + x = feature_transformer.transform(x_batch) y = label_transform_fun(y_batch) model.fit(X=x, y=y) diff --git a/takepod/models/trainer.py b/takepod/models/trainer.py index e1c71cdc..5c8ec39a 100644 --- a/takepod/models/trainer.py +++ b/takepod/models/trainer.py @@ -4,7 +4,7 @@ import numpy as np -from takepod.models import AbstractSupervisedModel +from takepod.models import AbstractSupervisedModel, FeatureTransformer from takepod.datasets import Iterator @@ -15,8 +15,7 @@ class AbstractTrainer(ABC): def train(self, model: AbstractSupervisedModel, iterator: Iterator, - feature_transform_fun: - Callable[[NamedTuple], np.ndarray], + feature_transformer: FeatureTransformer, label_transform_fun: Callable[[NamedTuple], np.ndarray], **kwargs): @@ -27,7 +26,7 @@ def train(self, The model that needs to be trained. iterator : Iterator Iterator instance that provides data from a dataset - feature_transform_fun: Callable[[NamedTuple], np.ndarray] + feature_transformer: Callable[[NamedTuple], np.ndarray] Callable that transforms the input part of the batch returned by the iterator into features that can be fed into the model. 
        label_transform_fun: Callable[[NamedTuple], np.ndarray]
diff --git a/takepod/models/transformers.py b/takepod/models/transformers.py
index bdd5a850..0b5cc6ce 100644
--- a/takepod/models/transformers.py
+++ b/takepod/models/transformers.py
@@ -19,22 +19,46 @@ def transform(self,
         pass
 
 
+class DummyTensorTransformer(TensorTransformer):
+
+    def fit(self,
+            x: np.ndarray,
+            y: np.ndarray):
+        pass
+
+    def transform(self,
+                  x: np.array
+                  ) -> np.ndarray:
+        return x
+
+
+# TODO add mechanism for Feature transformer to know if its tensor_transformer needs
+# fitting so batching can be avoided by callers.
 class FeatureTransformer:
 
     def __init__(self,
                  feature_extraction_fn: Callable[[NamedTuple], np.ndarray],
-                 tensor_transformer: TensorTransformer):
+                 tensor_transformer: TensorTransformer = None):
         self.feature_extraction_fn = feature_extraction_fn
-        self.tensor_transform = tensor_transformer
+        self.tensor_transformer = tensor_transformer
 
     def fit(self,
             x: NamedTuple,
             y: np.ndarray):
+        if not self.requires_fitting():
+            return
+
         x_tensor = self.feature_extraction_fn(x)
-        self.tensor_transform.fit(x_tensor, y)
+        self.tensor_transformer.fit(x_tensor, y)
 
     def transform(self,
                   x: NamedTuple) -> np.ndarray:
         x_tensor = self.feature_extraction_fn(x)
-        self.tensor_transform.transform(x_tensor)
+        if self.tensor_transformer is None:
+            return x_tensor
+
+        else:
+            self.tensor_transformer.transform(x_tensor)
+
+    def requires_fitting(self):
+        return self.tensor_transformer is not None
diff --git a/test/models/test_experiment.py b/test/models/test_experiment.py
index 40065431..37f3927d 100644
--- a/test/models/test_experiment.py
+++ b/test/models/test_experiment.py
@@ -1,22 +1,71 @@
 from collections import namedtuple
 
+import pytest
 import numpy as np
 
 from takepod.models import AbstractSupervisedModel, Experiment
+from takepod.datasets import Dataset, Iterator
+from takepod.storage import Field, ExampleFactory, Vocab
+
+
+@pytest.fixture
+def dataset():
+    data = [{"Name": "Mark Dark",
+             "Score": 5},
+            {"Name": "Stephen Smith",
+             "Score": 10},
+            {"Name": "Ann Mann",
+             "Score": 15}]
+
+    name_field = Field("Name",
+                       vocab=Vocab(),
+                       store_as_raw=True,
+                       tokenizer="split")
+
+    score_field = Field("Score",
+                        custom_numericalize=int,
+                        tokenize=False,
+                        is_target=True)
+
+    fields = {"Name": name_field,
+              "Score": score_field}
+
+    example_factory = ExampleFactory(fields)
+    examples = [example_factory.from_dict(data_) for data_ in data]
+
+    ds = Dataset(examples, fields)
+    ds.finalize_fields()
+    return ds
+
+
+def MockDataset():
+    pass
+
 
 def mock_feature_transform_fun(x_batch):
-    return x_batch.input
+    return x_batch.Score
 
 
 def mock_label_transform_fun(y_batch):
-    return y_batch.output
+    return y_batch.Score
 
 
-class MockDataset:
-    pass
+class MockTransformer:
+
+    def __init__(self, to_fit):
+        self.to_fit = to_fit
+        self.fit_called = 0
+
+    def fit(self, x, y):
+        self.fit_called += 1
+        pass
+
+    def transform(self, x_batch):
+        return mock_feature_transform_fun(x_batch)
+
+    def requires_fitting(self):
+        return self.to_fit
 
 
-def test_experiment_train():
+@pytest.mark.parametrize("fit_transformer", (False, True))
+def test_experiment_train(dataset, fit_transformer):
     default_model_args = {
         'm_arg1': 1,
         'm_arg2': 2
@@ -49,22 +98,9 @@ def test_experiment_train(dataset, fit_transformer):
         't_arg3': 4
     }
 
-    class MockIterator:
-        input_batch_class = namedtuple("input_batch_class", ["input"])
-        output_batch_class = namedtuple("output_batch_class", ["output"])
-
-        def __iter__(self):
-            x = np.array(
-                [
-                    [1, 2],
-                    [3, 4]
-                ])
-
-            y = np.array([5, 6])
+    mock_transformer = MockTransformer(fit_transformer)
 
-            input_batch = self.input_batch_class(input=x)
-            target_batch = self.output_batch_class(output=y)
-            yield input_batch, target_batch
+    my_iterator = Iterator(dataset)
 
     class MockModel:
         def __init__(self, **kwargs):
@@ -78,12 +114,12 @@ def __init__(self):
         def train(self,
                   model,
                   iterator,
-                  feature_transform_fun,
+                  feature_transformer,
                   label_transform_fun,
                   **kwargs):
             assert isinstance(model, MockModel)
-            assert isinstance(iterator, MockIterator)
+            assert iterator is my_iterator
-            assert feature_transform_fun is mock_feature_transform_fun
+            assert feature_transformer is mock_transformer
             assert label_transform_fun is mock_label_transform_fun
             assert kwargs == expected_trainer_args
             self.train_called += 1
@@ -92,18 +128,22 @@ def train(self,
     trainer = MockTrainer()
 
     experiment = Experiment(MockModel,
                             trainer,
-                            lambda _: MockIterator(),
-                            feature_transform_fun=mock_feature_transform_fun,
+                            lambda _: my_iterator,
+                            feature_transformer=mock_transformer,
                             label_transform_fun=mock_label_transform_fun)
 
     experiment.set_default_model_args(**default_model_args)
     experiment.set_default_trainer_args(**default_trainer_args)
 
-    experiment.fit(MockDataset(),
+    experiment.fit(dataset,
                    model_args,
                    trainer_args)
 
     assert trainer.train_called == 1
+    if fit_transformer:
+        assert mock_transformer.fit_called == 1
+    else:
+        assert mock_transformer.fit_called == 0
 
 
 def test_experiment_predict():
diff --git a/test/models/test_simple_trainers.py b/test/models/test_simple_trainers.py
index 9145c10a..92d1fa7c 100644
--- a/test/models/test_simple_trainers.py
+++ b/test/models/test_simple_trainers.py
@@ -3,6 +3,7 @@
 
 from takepod.models.impl.simple_trainers import SimpleTrainer
 from takepod.models.model import AbstractSupervisedModel
+from takepod.models import FeatureTransformer
 from takepod.datasets.iterator import Iterator
 from test.storage.conftest import (tabular_dataset, json_file_path)  # noqa
 
@@ -20,7 +21,7 @@ def test_simple_trainer_no_num_epoch(tabular_dataset, model):
     trainer = SimpleTrainer()
     trainer.train(model,
                   iterator=iterator,
-                  feature_transform_fun=lambda x: x,
+                  feature_transformer=lambda x: x,
                   label_transform_fun=lambda y: y)
 
 
@@ -29,9 +30,10 @@ def test_simple_trainer_num_epoch(tabular_dataset, model):
    tabular_dataset.finalize_fields()
     iterator = Iterator(tabular_dataset, batch_size=len(tabular_dataset))
     trainer = SimpleTrainer()
+    feature_transformer = FeatureTransformer(lambda x: x)
     trainer.train(model=model,
                   iterator=iterator,
-                  feature_transform_fun=lambda x: x,
+                  feature_transformer=feature_transformer,
                   label_transform_fun=lambda y: y,
                   **{trainer.MAX_EPOCH_KEY: 10})
     assert model.fit.call_count == 10
@@ -42,7 +44,7 @@ def mock_feature_transform_fun(x):
 
 
 def mock_label_transform_fun(y):
-    y
+    return y
 
 
 @pytest.mark.usefixtures("tabular_dataset", "mocker", "model")  # noqa
 def test_simple_trainer_batch_transform_call(tabular_dataset, mocker, model):
@@ -55,11 +57,12 @@ def test_simple_trainer_batch_transform_call(tabular_dataset, mocker, model):
     with mocker.patch(
             "test.models.test_simple_trainers.mock_label_transform_fun",
             return_value=next(iterator.__iter__())[1]):
+        feature_transformer = FeatureTransformer(mock_feature_transform_fun)
         trainer = SimpleTrainer()
         trainer.train(
             model=model,
             iterator=iterator,
-            feature_transform_fun=mock_feature_transform_fun,
+            feature_transformer=feature_transformer,
            label_transform_fun=mock_label_transform_fun,
             **{trainer.MAX_EPOCH_KEY: 10})
     assert mock_feature_transform_fun.call_count == 10  # pylint: disable=E1101

From 43b972a36c745f61ad4889143324ea0716416737 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Tue, 24 Sep 2019 15:32:17 +0200
Subject: [PATCH 05/21] WIP

---
 takepod/models/experiment.py   | 60 +++++++++++++++++++++++----
 takepod/models/transformers.py |  6 ++-
 takepod/pipeline/pipeline.py   | 76 ++++++++++++++++++++++++++++++++--
 3 files changed, 129 insertions(+), 13 deletions(-)

diff --git a/takepod/models/experiment.py b/takepod/models/experiment.py
index 662030a2..d5e29523 100644
--- a/takepod/models/experiment.py
+++ b/takepod/models/experiment.py
@@ -1,6 +1,8 @@
 """Modules defines an experiment - class used to combine iteration over data,
 model training and prediction."""
-from typing import Callable, NamedTuple, Dict, Type
+from typing import Callable, NamedTuple, Dict, Type, Union
+from inspect import isclass
+import logging
 
 import numpy as np
 
@@ -10,12 +12,14 @@
     default_feature_transform, default_label_transform, FeatureTransformer
 from takepod.models.trainer import AbstractTrainer
 
+_LOGGER = logging.getLogger(__name__)
+
 
 class Experiment:
     """Class used to streamline model fitting and prediction."""
 
     def __init__(self,
-                 model_class: Type[AbstractSupervisedModel],
+                 model: Union[Type[AbstractSupervisedModel], AbstractSupervisedModel],
                  trainer: AbstractTrainer,
                  training_iterator_callable: Callable[[Dataset], Iterator],
                  prediction_iterator_callable: Callable[[Dataset], Iterator] = None,
@@ -28,8 +32,11 @@ def __init__(self,
 
         Parameters
         ----------
-        model_class : class
-            Class of the Model to be fitted.
+        model : class or model instance
+            Class of the Model to be fitted or a pre-trained model.
+            If pre-trained model is passed and `fit` is called a new model instance will
+            be created. For fine-tuning of the passed model instance call
+            `partial_fit`.
             Must be a subclass of Podium's `AbstractSupervisedModel`
 
         trainer : AbstractTrainer
@@ -56,8 +63,13 @@ def __init__(self,
             result of this callable for those same examples.
         """
         # TODO update docs to account for FeatureTransformer
-        self.model_class = model_class
-        self.model = None
+        if isclass(model):
+            self.model_class = model
+            self.model = None
+        else:
+            self.model_class = model.__class__
+            self.model = model
+
         self.trainer = trainer
         self.training_iterator_callable = training_iterator_callable
@@ -127,6 +139,7 @@ def fit(self,
             default arguments defined with `set_default_trainer_args` updated/overridden
             by 'trainer_kwargs'.
""" + model_kwargs = {} if model_kwargs is None else model_kwargs trainer_kwargs = {} if trainer_kwargs is None else trainer_kwargs @@ -136,7 +149,7 @@ def fit(self, trainer_args = self.default_trainer_args.copy() trainer_args.update(trainer_kwargs) - # Fit the feature transformer if it needs2 fitting + # Fit the feature transformer if it needs fitting if self.feature_transformer.requires_fitting(): x_batch, y_batch = next(SingleBatchIterator(dataset).__iter__()) y = self.label_transform_fun(y_batch) @@ -153,6 +166,31 @@ def fit(self, self.label_transform_fun, **trainer_args) + def partial_fit(self, + dataset: Dataset, + trainer_kwargs: Dict = None, + trainer: AbstractTrainer = None, + training_iterator_callable: Callable[[Dataset], Iterator] = None): + self.check_if_model_exists() + + trainer = trainer if trainer is not None else self.trainer + + trainer_kwargs = {} if trainer_kwargs is None else trainer_kwargs + trainer_args = self.default_trainer_args.copy() + trainer_args.update(trainer_kwargs) + + training_iterator_callable = training_iterator_callable \ + if training_iterator_callable is not None \ + else self.training_iterator_callable + + iterator = training_iterator_callable(dataset) + + trainer.train(self.model, + iterator, + self.feature_transformer, + self.label_transform_fun, + **trainer_args) + def predict(self, dataset: Dataset, **kwargs @@ -174,6 +212,7 @@ def predict(self, """ # TODO: new method of providing examples must be defined. # examples is taken in dataset form as proof-of-concept. + self.check_if_model_exists() y = [] @@ -184,3 +223,10 @@ def predict(self, y.append(prediction_tensor) return np.concatenate(y) + + def check_if_model_exists(self): + if self.model is None: + errmsg = "Model instance not available. Please provide a model instance in " \ + "the constructor or call `fit` before calling `partial_fit.`" + _LOGGER.error(errmsg) + raise RuntimeError(errmsg) diff --git a/takepod/models/transformers.py b/takepod/models/transformers.py index 0b5cc6ce..bebb843f 100644 --- a/takepod/models/transformers.py +++ b/takepod/models/transformers.py @@ -38,9 +38,11 @@ class FeatureTransformer: def __init__(self, feature_extraction_fn: Callable[[NamedTuple], np.ndarray], - tensor_transformer: TensorTransformer = None): + tensor_transformer: TensorTransformer = None, + requires_fitting=True): self.feature_extraction_fn = feature_extraction_fn self.tensor_transformer = tensor_transformer + self.requires_fitting_flag = requires_fitting def fit(self, x: NamedTuple, @@ -61,4 +63,4 @@ def transform(self, self.tensor_transformer.transform(x_tensor) def requires_fitting(self): - return self.tensor_transformer is not None + return self.tensor_transformer is not None and self.requires_fitting_flag diff --git a/takepod/pipeline/pipeline.py b/takepod/pipeline/pipeline.py index 478b1274..734aae9d 100644 --- a/takepod/pipeline/pipeline.py +++ b/takepod/pipeline/pipeline.py @@ -1,9 +1,9 @@ -from enum import Enum, auto +from typing import Union, Dict, List import logging from takepod.storage import ExampleFactory, ExampleFormat from takepod.datasets import Dataset, SingleBatchIterator -from takepod.models import AbstractSupervisedModel, FeatureTransformer +from takepod.models import AbstractSupervisedModel, FeatureTransformer, Experiment _LOGGER = logging.getLogger(__name__) @@ -11,11 +11,16 @@ class Pipeline: def __init__(self, - fields, + fields: Union[Dict, List], example_format: ExampleFormat, feature_transformer: FeatureTransformer, model, - predict_kwargs + predict_kwargs=None, + 
+                 trainer=None,
+                 trainer_kwargs=None,
+                 trainer_iterator=None,
+                 model_args=None,
+                 label_transform_fn=None
                  ):
         if example_format in (ExampleFormat.LIST, ExampleFormat.CSV, ExampleFormat.NLTK):
             if not isinstance(fields, (list, tuple)):
@@ -38,6 +43,12 @@ def __init__(self,
         self.predict_kwargs = predict_kwargs
         self.example_factory = ExampleFactory(fields)
 
+        self.trainer = trainer
+        self.trainer_kwargs = trainer_kwargs
+        self.trainer_iterator = trainer_iterator
+        self.model_args = model_args
+        self.label_transform_fn = label_transform_fn
+
     def predict(self, raw_example):
         processed_example = self.example_factory.from_format(raw_example,
                                                              self.example_format)
@@ -47,3 +58,60 @@ def predict(self, raw_example):
         x = self.feature_transformer.transform(x_batch)
         prediction_dict = self.model.predict(x, **self.predict_kwargs)
         return prediction_dict[AbstractSupervisedModel.PREDICTION_KEY]
+
+    def fit(self,
+            dataset: Dataset,
+            trainer=None,
+            trainer_iterator=None,
+            trainer_kwargs=None,
+            model_kwargs=None,
+            reset_model=True):
+        trainer = trainer if trainer is not None else self.trainer
+        if trainer is None:
+            errmsg = "No trainer provided. Trainer must be provided either in the " \
+                     "constructor or as an argument."
+            _LOGGER.error(errmsg)
+            raise RuntimeError(errmsg)
+
+        trainer_kwargs = trainer_kwargs if trainer_kwargs is not None \
+            else self.trainer_kwargs
+        if trainer_kwargs is None:
+            errmsg = "No trainer_kwargs provided. Trainer arguments must be provided " \
+                     "either in the constructor or as an argument. If no arguments are " \
+                     "necessary, please pass an empty dict."
+            _LOGGER.error(errmsg)
+            raise RuntimeError(errmsg)
+
+        model_kwargs = model_kwargs if model_kwargs is not None \
+            else self.model_kwargs
+        if model_kwargs is None:
+            errmsg = "No model_kwargs provided. Model arguments must be provided " \
+                     "either in the constructor or as an argument. If no arguments are " \
+                     "necessary, please pass an empty dict."
+            _LOGGER.error(errmsg)
+            raise RuntimeError(errmsg)
+
+        trainer_iterator = trainer_iterator if trainer_iterator is not None \
+            else self.trainer_iterator
+        if trainer_iterator is None:
+            errmsg = "No trainer_iterator provided. Trainer_iterator must be provided " \
+                     "either in the constructor or as an argument. If no arguments are " \
+                     "necessary, please pass an empty dict."
+            _LOGGER.error(errmsg)
+            raise RuntimeError(errmsg)
+
+        experiment = Experiment(self.model,
+                                trainer,
+                                lambda ds: trainer_iterator.set_dataset(ds),
+                                feature_transformer=self.feature_transformer,
+                                label_transform_fun=self.label_transform_fn)
+        if reset_model:
+            experiment.fit(dataset,
+                           model_kwargs,
+                           trainer_kwargs)
+        else:
+            experiment.partial_fit(dataset,
+                                   trainer_kwargs)
+
+        self.model = experiment.model
+        self.
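A minimal usage sketch of the Experiment API at this point in the series — `my_model` (a pre-trained AbstractSupervisedModel instance), `feature_transformer` and `train_dataset` are assumed placeholder names, not code from these patches:

    # Sketch under assumed names; fine-tunes an existing model instance
    # via partial_fit instead of constructing a new one via fit.
    from takepod.datasets import Iterator
    from takepod.models import Experiment
    from takepod.models.impl.simple_trainers import SimpleTrainer

    trainer = SimpleTrainer()
    experiment = Experiment(my_model,  # an instance; its class is kept for later refits
                            trainer=trainer,
                            training_iterator_callable=lambda ds: Iterator(ds, batch_size=32),
                            feature_transformer=feature_transformer)

    # Unlike fit, partial_fit raises if no model instance is available yet.
    experiment.partial_fit(train_dataset,
                           trainer_kwargs={SimpleTrainer.MAX_EPOCH_KEY: 1})

Since `fit` re-instantiates the model from its class, passing a pre-trained instance only pays off through `partial_fit`.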
From 4522b44061cdf9698faf3abd440169826300653d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 25 Sep 2019 21:22:22 +0200
Subject: [PATCH 06/21] implemented pipeline

---
 takepod/examples/experiment_example.py |  14 ++--
 takepod/models/experiment.py           |  76 +++++++++++++-----
 takepod/pipeline/pipeline.py           | 102 +++++++++----------------
 test/models/test_experiment.py         |   6 +-
 4 files changed, 101 insertions(+), 97 deletions(-)

diff --git a/takepod/examples/experiment_example.py b/takepod/examples/experiment_example.py
index 0111927f..0d1829e4 100644
--- a/takepod/examples/experiment_example.py
+++ b/takepod/examples/experiment_example.py
@@ -9,7 +9,7 @@
 from takepod.datasets.impl.pauza_dataset import PauzaHRDataset
 from takepod.models.impl.fc_model import ScikitMLPClassifier
 from takepod.models.impl.simple_trainers import SimpleTrainer
-from takepod.models import Experiment
+from takepod.models import Experiment, FeatureTransformer
 from takepod.validation import k_fold_classification_metrics
 from takepod.model_selection import grid_search
 from sklearn.metrics import accuracy_score
@@ -70,15 +70,15 @@ def train_iterator_provider(dataset):
     embedding_matrix = vectorizer.get_embedding_matrix(
         fields["Text"].vocab)
 
-    feature_transform = partial(feature_transform_mean_fun,
+    feature_transform_fn = partial(feature_transform_mean_fun,
                                 embedding_matrix=embedding_matrix)
+    feature_transformer = FeatureTransformer(feature_transform_fn)
 
     experiment = Experiment(ScikitMLPClassifier,
-                            trainer,
-                            train_iterator_provider,
-                            None,
-                            feature_transform,
-                            label_transform_fun)
+                            trainer=trainer,
+                            training_iterator_callable=train_iterator_provider,
+                            feature_transformer=feature_transformer,
+                            label_transform_fun=label_transform_fun)
 
     _, model_params, train_params = \
         grid_search(experiment,
diff --git a/takepod/models/experiment.py b/takepod/models/experiment.py
index d5e29523..325badbf 100644
--- a/takepod/models/experiment.py
+++ b/takepod/models/experiment.py
@@ -20,10 +20,10 @@ class Experiment:
 
     def __init__(self,
                  model: Union[Type[AbstractSupervisedModel], AbstractSupervisedModel],
-                 trainer: AbstractTrainer,
-                 training_iterator_callable: Callable[[Dataset], Iterator],
-                 prediction_iterator_callable: Callable[[Dataset], Iterator] = None,
                  feature_transformer: FeatureTransformer = None,
+                 trainer: AbstractTrainer = None,
+                 training_iterator_callable: Callable[[Dataset], Iterator] = None,
+                 prediction_iterator_callable: Callable[[Dataset], Iterator] = None,
                  label_transform_fun:
                      Callable[[NamedTuple], np.ndarray] = None
                  ):
@@ -52,9 +52,10 @@ def __init__(self,
             a single tensor before being returned.
             If passed None, a SingleBatchIterator will be used as a default.
 
-        feature_transform_fun : Callable[[NamedTuple], np.ndarray]
-            Callable that transforms the input part of the batch returned by the iterator
-            into features that can be fed into the model.
+        feature_transformer : FeatureTransformer
+            FeatureTransformer that transforms the input part of the batch returned by the
+            iterator into features that can be fed into the model. Will also be fitted
+            during Experiment fitting.
 
         label_transform_fun : Callable[[NamedTuple], np.ndarray]
             Callable that transforms the target part of the batch returned by the iterator
@@ -63,7 +63,6 @@ def __init__(self,
             the prediction result of the model for some examples must be identical to
             the result of this callable for those same examples.
""" - # TODO update docs to account for FeatureTransformer if isclass(model): self.model_class = model self.model = None @@ -119,7 +119,10 @@ def set_default_trainer_args(self, **kwargs): def fit(self, dataset: Dataset, model_kwargs: Dict = None, - trainer_kwargs: Dict = None + trainer_kwargs: Dict = None, + feature_transformer: FeatureTransformer = None, + trainer: AbstractTrainer = None, + training_iterator_callable: Callable[[Dataset], Iterator] = None, ): """Fits the model to the provided Dataset. During fitting, the provided Iterator and Trainer are used. @@ -138,6 +141,22 @@ def fit(self, Dict containing trainer arguments. Arguments passed to the trainer are the default arguments defined with `set_default_trainer_args` updated/overridden by 'trainer_kwargs'. + + feature_transformer : FeatureTransformer, Optional + FeatureTransformer that transforms the input part of the batch returned by the + iterator into features that can be fed into the model. Will also be fitted + during Experiment fitting. + If None, the default FeatureTransformer provided in the constructor will be + used. Otherwise, this will overwrite the default feature transformer. + + trainer : AbstractTrainer, Optional + Trainer used to fit the model. If None, the trainer provided in the + constructor will be used. + + training_iterator_callable: Callable[[Dataset], Iterator] + Callable used to instantiate new instances of the Iterator used in fitting the + model. If None, the training_iterator_callable provided in the + constructor will be used. """ model_kwargs = {} if model_kwargs is None else model_kwargs @@ -149,6 +168,16 @@ def fit(self, trainer_args = self.default_trainer_args.copy() trainer_args.update(trainer_kwargs) + trainer = trainer if trainer is not None else self.trainer + if trainer is None: + errmsg = "No trainer provided. Trainer must be provided either in the " \ + "constructor or as an argument to the fit method." + _LOGGER.error(errmsg) + raise RuntimeError(errmsg) + + if feature_transformer is not None: + self.feature_transformer = feature_transformer + # Fit the feature transformer if it needs fitting if self.feature_transformer.requires_fitting(): x_batch, y_batch = next(SingleBatchIterator(dataset).__iter__()) @@ -157,23 +186,32 @@ def fit(self, # Create new model instance self.model = self.model_class(**model_args) - train_iterator = self.training_iterator_callable(dataset) + training_iterator_callable = training_iterator_callable \ + if training_iterator_callable is not None \ + else self.training_iterator_callable + + train_iterator = training_iterator_callable(dataset) # Train the model - self.trainer.train(self.model, - train_iterator, - self.feature_transformer, - self.label_transform_fun, - **trainer_args) + trainer.train(self.model, + train_iterator, + self.feature_transformer, + self.label_transform_fun, + **trainer_args) def partial_fit(self, dataset: Dataset, trainer_kwargs: Dict = None, trainer: AbstractTrainer = None, training_iterator_callable: Callable[[Dataset], Iterator] = None): - self.check_if_model_exists() + self._check_if_model_exists() trainer = trainer if trainer is not None else self.trainer + if trainer is None: + errmsg = "No trainer provided. Trainer must be provided either in the " \ + "constructor or as an argument to the partial_fit method." 
+            _LOGGER.error(errmsg)
+            raise RuntimeError(errmsg)
 
         trainer_kwargs = {} if trainer_kwargs is None else trainer_kwargs
         trainer_args = self.default_trainer_args.copy()
         trainer_args.update(trainer_kwargs)
@@ -183,10 +221,10 @@ def partial_fit(self,
             if training_iterator_callable is not None \
             else self.training_iterator_callable
 
-        iterator = training_iterator_callable(dataset)
+        train_iterator = training_iterator_callable(dataset)
 
         trainer.train(self.model,
-                      iterator,
+                      train_iterator,
                       self.feature_transformer,
                       self.label_transform_fun,
                       **trainer_args)
@@ -212,7 +250,7 @@ def predict(self,
         """
         # TODO: new method of providing examples must be defined.
         # examples is taken in dataset form as proof-of-concept.
-        self.check_if_model_exists()
+        self._check_if_model_exists()
 
         y = []
 
@@ -224,7 +262,7 @@ def predict(self,
 
         return np.concatenate(y)
 
-    def check_if_model_exists(self):
+    def _check_if_model_exists(self):
         if self.model is None:
             errmsg = "Model instance not available. Please provide a model instance in " \
                      "the constructor or call `fit` before calling `partial_fit.`"
diff --git a/takepod/pipeline/pipeline.py b/takepod/pipeline/pipeline.py
index 734aae9d..aecdd9c9 100644
--- a/takepod/pipeline/pipeline.py
+++ b/takepod/pipeline/pipeline.py
@@ -14,11 +14,10 @@ def __init__(self,
                  fields: Union[Dict, List],
                  example_format: ExampleFormat,
                  feature_transformer: FeatureTransformer,
-                 model,
-                 predict_kwargs=None,
+                 model: AbstractSupervisedModel,
                  trainer=None,
-                 trainer_kwargs=None,
-                 trainer_iterator=None,
+                 trainer_args=None,
+                 trainer_iterator_callable=None,
                  model_args=None,
                  label_transform_fn=None
                  ):
@@ -36,82 +35,49 @@ def __init__(self,
         self.fields = fields
         self.example_format = example_format
         self.example_factory = ExampleFactory(fields)
 
-        self.trainer = trainer
-        self.trainer_kwargs = trainer_kwargs
-        self.trainer_iterator = trainer_iterator
-        self.model_args = model_args
-        self.label_transform_fn = label_transform_fn
+        self.experiment = Experiment(model,
+                                     feature_transformer=feature_transformer,
+                                     trainer=trainer,
+                                     training_iterator_callable=trainer_iterator_callable,
+                                     label_transform_fun=label_transform_fn)
 
+        self.experiment.set_default_model_args(**model_args)
+        self.experiment.set_default_trainer_args(**trainer_args)
 
-    def predict(self, raw_example):
+    def predict_raw(self, raw_example):
         processed_example = self.example_factory.from_format(raw_example,
                                                              self.example_format)
         ds = Dataset([processed_example], self.fields)
 
-        x_batch, _ = next(SingleBatchIterator(ds).__iter__())
-        x = self.feature_transformer.transform(x_batch)
-        prediction_dict = self.model.predict(x, **self.predict_kwargs)
-        return prediction_dict[AbstractSupervisedModel.PREDICTION_KEY]
+        return self.experiment.predict(ds)
+
+    def predict(self, dataset):
+        self.experiment.predict(dataset)
 
     def fit(self,
             dataset: Dataset,
+            feature_transformer=None,
             trainer=None,
-            trainer_iterator=None,
+            trainer_iterator_callable=None,
             trainer_kwargs=None,
-            model_kwargs=None,
-            reset_model=True):
-        trainer = trainer if trainer is not None else self.trainer
-        if trainer is None:
-            errmsg = "No trainer provided. Trainer must be provided either in the " \
-                     "constructor or as an argument."
-            _LOGGER.error(errmsg)
-            raise RuntimeError(errmsg)
-
-        trainer_kwargs = trainer_kwargs if trainer_kwargs is not None \
-            else self.trainer_kwargs
-        if trainer_kwargs is None:
-            errmsg = "No trainer_kwargs provided. Trainer arguments must be provided " \
-                     "either in the constructor or as an argument. If no arguments are " \
-                     "necessary, please pass an empty dict."
-            _LOGGER.error(errmsg)
-            raise RuntimeError(errmsg)
-
-        model_kwargs = model_kwargs if model_kwargs is not None \
-            else self.model_kwargs
-        if model_kwargs is None:
-            errmsg = "No model_kwargs provided. Model arguments must be provided " \
-                     "either in the constructor or as an argument. If no arguments are " \
-                     "necessary, please pass an empty dict."
-            _LOGGER.error(errmsg)
-            raise RuntimeError(errmsg)
-
-        trainer_iterator = trainer_iterator if trainer_iterator is not None \
-            else self.trainer_iterator
-        if trainer_iterator is None:
-            errmsg = "No trainer_iterator provided. Trainer_iterator must be provided " \
-                     "either in the constructor or as an argument. If no arguments are " \
-                     "necessary, please pass an empty dict."
-            _LOGGER.error(errmsg)
-            raise RuntimeError(errmsg)
-
-        experiment = Experiment(self.model,
-                                trainer,
-                                lambda ds: trainer_iterator.set_dataset(ds),
-                                feature_transformer=self.feature_transformer,
-                                label_transform_fun=self.label_transform_fn)
-        if reset_model:
-            experiment.fit(dataset,
-                           model_kwargs,
-                           trainer_kwargs)
-        else:
-            experiment.partial_fit(dataset,
-                                   trainer_kwargs)
+            model_kwargs=None):
+        self.experiment.fit(dataset,
+                            model_kwargs=model_kwargs,
+                            trainer_kwargs=trainer_kwargs,
+                            feature_transformer=feature_transformer,
+                            trainer=trainer,
+                            training_iterator_callable=trainer_iterator_callable
+                            )
 
-        self.model = experiment.model
-        self.
+    def partial_fit(self,
+                    dataset: Dataset,
+                    trainer=None,
+                    trainer_iterator_callable=None,
+                    trainer_kwargs=None):
+        self.experiment.partial_fit(dataset,
+                                    trainer_kwargs=trainer_kwargs,
+                                    trainer=trainer,
+                                    trainer_iterator_callable=trainer_iterator_callable)
diff --git a/test/models/test_experiment.py b/test/models/test_experiment.py
index 37f3927d..877ead7d 100644
--- a/test/models/test_experiment.py
+++ b/test/models/test_experiment.py
@@ -127,8 +127,8 @@ def train(self,
     trainer = MockTrainer()
 
     experiment = Experiment(MockModel,
-                            trainer,
-                            lambda _: my_iterator,
+                            trainer=trainer,
+                            training_iterator_callable=lambda _: my_iterator,
                             feature_transformer=mock_transformer,
                             label_transform_fun=mock_label_transform_fun)
 
@@ -210,7 +210,7 @@ def train(self, model, iterator, feature_transform_fun=None,
 
     experiment = Experiment(
         MockModel,
-        MockTrainer(),
+        trainer=MockTrainer(),
         training_iterator_callable=lambda _: MockIterator(),
         prediction_iterator_callable=lambda _: MockIterator()
     )

From a09d683c15f836837a4bac791db709e1303ba40f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Thu, 26 Sep 2019 19:04:22 +0200
Subject: [PATCH 07/21] Finished pipeline, added documentation

---
 takepod/examples/experiment_example.py |   7 +-
 takepod/models/experiment.py           |  25 +++++
 takepod/models/transformers.py         |   2 +-
 takepod/pipeline/pipeline.py           | 133 ++++++++++++++++---------
 4 files changed, 118 insertions(+), 49 deletions(-)

diff --git a/takepod/examples/experiment_example.py b/takepod/examples/experiment_example.py
index 0d1829e4..bdf1cc55 100644
--- a/takepod/examples/experiment_example.py
+++ b/takepod/examples/experiment_example.py
@@ -13,7 +13,7 @@
 from takepod.validation import k_fold_classification_metrics
 from takepod.model_selection import grid_search
 from sklearn.metrics import accuracy_score
-
+from sklearn.preprocessing import StandardScaler
 
 def numericalize_pauza_rating(rating):
     """Function numericalizes pauza_hr dataset rating field"""
@@ -70,15 +70,15 @@ def train_iterator_provider(dataset):
     embedding_matrix = vectorizer.get_embedding_matrix(
         fields["Text"].vocab)
 
     feature_transform_fn = partial(feature_transform_mean_fun,
                                 embedding_matrix=embedding_matrix)
-    feature_transformer = FeatureTransformer(feature_transform_fn)
+
+
+
+    feature_transformer = FeatureTransformer(feature_transform_fn, StandardScaler())
 
     experiment = Experiment(ScikitMLPClassifier,
                             trainer=trainer,
diff --git a/takepod/models/experiment.py b/takepod/models/experiment.py
index 325badbf..b7df1232 100644
--- a/takepod/models/experiment.py
+++ b/takepod/models/experiment.py
@@ -204,6 +204,31 @@ def partial_fit(self,
                     trainer_kwargs: Dict = None,
                     trainer: AbstractTrainer = None,
                     training_iterator_callable: Callable[[Dataset], Iterator] = None):
+        """Fits the model to the data without resetting the model.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            Dataset to fit the model to.
+
+        trainer_kwargs : dict
+            Dict containing trainer arguments. Arguments passed to the trainer are the
+            default arguments defined with `set_default_trainer_args` updated/overridden
+            by 'trainer_kwargs'.
+
+        trainer : AbstractTrainer, Optional
+            Trainer used to fit the model. If None, the trainer provided in the
+            constructor will be used.
+
+        training_iterator_callable: Callable[[Dataset], Iterator]
+            Callable used to instantiate new instances of the Iterator used in fitting the
+            model. If None, the training_iterator_callable provided in the
+            constructor will be used.
+
+        Returns
+        -------
+
+        """
         self._check_if_model_exists()
diff --git a/takepod/models/transformers.py b/takepod/models/transformers.py
index bebb843f..17cf9669 100644
--- a/takepod/models/transformers.py
+++ b/takepod/models/transformers.py
@@ -60,7 +60,7 @@ def transform(self,
             return x_tensor
 
         else:
-            self.tensor_transformer.transform(x_tensor)
+            return self.tensor_transformer.transform(x_tensor)
 
     def requires_fitting(self):
         return self.tensor_transformer is not None and self.requires_fitting_flag
diff --git a/takepod/pipeline/pipeline.py b/takepod/pipeline/pipeline.py
index aecdd9c9..c7216673 100644
--- a/takepod/pipeline/pipeline.py
+++ b/takepod/pipeline/pipeline.py
@@ -1,26 +1,77 @@
-from typing import Union, Dict, List
+from typing import Union, Dict, List, Callable, NamedTuple, Any
 import logging
 
 from takepod.storage import ExampleFactory, ExampleFormat
-from takepod.datasets import Dataset, SingleBatchIterator
-from takepod.models import AbstractSupervisedModel, FeatureTransformer, Experiment
+from takepod.datasets import Dataset, Iterator
+from takepod.models import AbstractSupervisedModel, FeatureTransformer, Experiment, \
+    AbstractTrainer
+
+import numpy as np
 
 _LOGGER = logging.getLogger(__name__)
 
 
-class Pipeline:
+class Pipeline(Experiment):
+    """Class used to streamline the use of Podium. It contains all components needed to
+    train or fine-tune a pre-configured model and make predictions on new data."""
 
     def __init__(self,
                  fields: Union[Dict, List],
                  example_format: ExampleFormat,
                  feature_transformer: FeatureTransformer,
-                 model: AbstractSupervisedModel,
-                 trainer=None,
-                 trainer_args=None,
-                 trainer_iterator_callable=None,
-                 model_args=None,
-                 label_transform_fn=None
+                 model: AbstractSupervisedModel,
+                 trainer: AbstractTrainer = None,
+                 trainer_args: Dict = None,
+                 trainer_iterator_callable: Callable[[Dataset], Iterator] = None,
+                 model_args: Dict = None,
+                 label_transform_fn: Callable[[NamedTuple], np.ndarray] = None
                  ):
+        """Creates a new pipeline instance.
+
+        Parameters
+        ----------
+        fields : dict or list of fields
+            Fields used to process raw data. Can be either a dict mapping column names
+            to Fields (or tuples of Fields), or a list of Fields (or tuples of Fields).
+            A Field value of None means the corresponding column will
+            be ignored.
+
+        example_format: ExampleFormat
+            Format of expected raw examples.
+
+        feature_transformer: FeatureTransformer
+            FeatureTransformer used to transform data features from the podium "batch"
+            format into numpy arrays. Will be fitted along with the model to the provided
+            data.
+
+        model: AbstractSupervisedModel
+            Model used to make predictions.
+
+        trainer: AbstractTrainer, Optional
+            Trainer used to fit the model. If provided, this trainer instance will be
+            stored in the pipeline and used as the default trainer if no trainer is
+            provided in the `fit` and `partial_fit` methods.
+
+        trainer_args: Dict
+            Arguments passed as keyword arguments to the trainer during model training.
+            Arguments provided here are used as default arguments and can be updated by
+            passing extra arguments to the `fit` and `partial_fit` methods.
+
+        trainer_iterator_callable: Callable[[Dataset], Iterator]
+            Callable used to instantiate new instances of the Iterator used in fitting the
+            model.
+
+        model_args: Dict
+            Arguments passed as keyword arguments to the model during model instantiation.
+            Arguments provided here are used as default arguments and can be updated by
+            passing extra arguments to the `fit` method.
+
+        label_transform_fn: Callable[[NamedTuple], np.ndarray]
+            Callable that transforms the target part of the batch returned by the iterator
+            into the same format the model prediction is. For a hypothetical perfect model
+            the prediction result of the model for some examples must be identical to the
+            result of this callable for those same examples.
+
+        """
         if example_format in (ExampleFormat.LIST, ExampleFormat.CSV, ExampleFormat.NLTK):
@@ -39,45 +90,35 @@ def __init__(self,
         self.example_format = example_format
         self.example_factory = ExampleFactory(fields)
 
-        self.experiment = Experiment(model,
-                                     feature_transformer=feature_transformer,
-                                     trainer=trainer,
-                                     training_iterator_callable=trainer_iterator_callable,
-                                     label_transform_fun=label_transform_fn)
-
-        self.experiment.set_default_model_args(**model_args)
-        self.experiment.set_default_trainer_args(**trainer_args)
+        super().__init__(model,
+                         feature_transformer=feature_transformer,
+                         trainer=trainer,
+                         training_iterator_callable=trainer_iterator_callable,
+                         label_transform_fun=label_transform_fn)
+
+        self.set_default_model_args(**model_args)
+        self.set_default_trainer_args(**trainer_args)
 
+    def predict_raw(self,
+                    raw_example: Any,
+                    **kwargs) -> np.ndarray:
+        """Computes the prediction of the model for the one example.
+        The example must be of the format provided in the constructor as the
+        `example_format` parameter.
+
+        Parameters
+        ----------
+        raw_example: Any
+            Example to compute the prediction for.
+
+        kwargs
+            Keyword arguments passed to the model's `predict` method
+
+        Returns
+        -------
+        ndarray
+            Tensor containing the prediction for the example."""
         processed_example = self.example_factory.from_format(raw_example,
                                                              self.example_format)
         ds = Dataset([processed_example], self.fields)
 
-        return self.experiment.predict(ds)
-
-    def predict(self, dataset):
-        self.experiment.predict(dataset)
-
-    def fit(self,
-            dataset: Dataset,
-            feature_transformer=None,
-            trainer=None,
-            trainer_iterator_callable=None,
-            trainer_kwargs=None,
-            model_kwargs=None):
-        self.experiment.fit(dataset,
-                            model_kwargs=model_kwargs,
-                            trainer_kwargs=trainer_kwargs,
-                            feature_transformer=feature_transformer,
-                            trainer=trainer,
-                            training_iterator_callable=trainer_iterator_callable
-                            )
-
-    def partial_fit(self,
-                    dataset: Dataset,
-                    trainer=None,
-                    trainer_iterator_callable=None,
-                    trainer_kwargs=None):
-        self.experiment.partial_fit(dataset,
-                                    trainer_kwargs=trainer_kwargs,
-                                    trainer=trainer,
-                                    trainer_iterator_callable=trainer_iterator_callable)
+        return self.predict(ds, **kwargs)

From 838a90dfe5fa20401c3c30e2e316b10ffe928946 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Thu, 26 Sep 2019 19:26:35 +0200
Subject: [PATCH 08/21] Added some documentation

---
 takepod/models/transformers.py | 63 ++++++++++++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 2 deletions(-)

diff --git a/takepod/models/transformers.py b/takepod/models/transformers.py
index 17cf9669..1c5c8084 100644
--- a/takepod/models/transformers.py
+++ b/takepod/models/transformers.py
@@ -5,17 +5,40 @@
 
 
 class TensorTransformer(ABC):
-
+    """Abstract class used to transform tensors. Used in feature pre-processing during
+    training and prediction.
+    """
     @abstractmethod
     def fit(self,
             x: np.ndarray,
             y: np.ndarray):
+        """Fits the transformer to the provided data.
+
+        Parameters
+        ----------
+
+        x: np.ndarray
+            Features in numpy array form.
+        y: np.ndarray
+            Labels in numpy array form.
+        """
         pass
 
     @abstractmethod
     def transform(self,
                   x: np.array
                   ) -> np.ndarray:
+        """Transforms the passed features.
+
+        Parameters
+        ----------
+        x: np.ndarray
+            Features to be transformed in numpy array form.
+
+        Returns
+        -------
+        np.array
+            Transformed features."""
         pass
 
 
 # TODO add mechanism for Feature transformer to know if its tensor_transformer needs
 # fitting so batching can be avoided by callers.
 class FeatureTransformer:
-
+    """Class used to transform podium batches into features used in model prediction and
+    training."""
     def __init__(self,
                  feature_extraction_fn: Callable[[NamedTuple], np.ndarray],
                  tensor_transformer: TensorTransformer = None,
                  requires_fitting=True):
+        """Creates a new FeatureTransformer.
+
+        Parameters
+        ----------
+        feature_extraction_fn: Callable[[NamedTuple], np.ndarray]
+            Callable that takes a podium feature batch as an argument and returns a
+            numpy tensor representing the data.
+        tensor_transformer: TensorTransformer
+            TensorTransformer used to transform the tensors provided by the
+            `feature_extraction_fn` callable.
+        requires_fitting: bool
+            Whether the provided TensorTransformer requires fitting.
+        """
         self.feature_extraction_fn = feature_extraction_fn
         self.tensor_transformer = tensor_transformer
         self.requires_fitting_flag = requires_fitting
 
     def fit(self,
             x: NamedTuple,
             y: np.ndarray):
+        """Fits this tensor transformer to the provided data.
+ + Parameters + ---------- + x: NamedTuple + Podium feature batch containing the features to be transformed. + + y: np.ndarray + Labels corresponding to the features in `x`. + """ if not self.requires_fitting(): return @@ -55,6 +102,18 @@ def fit(self, def transform(self, x: NamedTuple) -> np.ndarray: + """ + Trasforms the provided podium feature batch into a numpy array. + Parameters + ---------- + x: NamedTuple + Feature batch to be transformed. + + Returns + ------- + np.ndarray + Transformed features. + """ x_tensor = self.feature_extraction_fn(x) if self.tensor_transformer is None: return x_tensor From e0cb520671e93cc6edbd7f75908fd3c6796e2342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Thu, 26 Sep 2019 19:50:51 +0200 Subject: [PATCH 09/21] Added transformer test --- takepod/models/__init__.py | 4 +-- test/models/test_transformers.py | 51 ++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 test/models/test_transformers.py diff --git a/takepod/models/__init__.py b/takepod/models/__init__.py index 71f5ba5d..f11c685a 100644 --- a/takepod/models/__init__.py +++ b/takepod/models/__init__.py @@ -2,10 +2,10 @@ from .model import AbstractFrameworkModel, AbstractSupervisedModel from .batch_transform_functions import default_feature_transform, default_label_transform -from .transformers import FeatureTransformer +from .transformers import FeatureTransformer, TensorTransformer from .experiment import Experiment from .trainer import AbstractTrainer __all__ = ["AbstractFrameworkModel", "AbstractSupervisedModel", "default_feature_transform", "default_label_transform", "Experiment", - "AbstractTrainer", "FeatureTransformer"] + "AbstractTrainer", "FeatureTransformer", "TensorTransformer"] diff --git a/test/models/test_transformers.py b/test/models/test_transformers.py new file mode 100644 index 00000000..d7bb7082 --- /dev/null +++ b/test/models/test_transformers.py @@ -0,0 +1,51 @@ +from collections import namedtuple + +import numpy as np + +from takepod.models import FeatureTransformer, TensorTransformer + + +def test_feature_transformer(): + mock_batch_class = namedtuple("Mock_feature_batch", ("mock_feature",)) + + def mock_feature_extraction_fn(x): + return x.mock_feature + + class MockTensorTransformer(TensorTransformer): + + def __init__(self): + self.fit_called = False + + def fit(self, + x: np.ndarray, + y: np.ndarray): + self.fit_called = True + assert np.all(x == np.array([[1, 2], [3, 4]])) + assert np.all(y == np.array([1, 2])) + + def transform(self, + x: np.array + ) -> np.ndarray: + assert np.all(x == np.array([[4, 5], [6, 7]])) + return np.array([3, 4]) + + mock_tensor_transformer = MockTensorTransformer() + feature_transformer = FeatureTransformer(mock_feature_extraction_fn, + mock_tensor_transformer, + requires_fitting=True) + + mock_feature_batch = mock_batch_class(mock_feature=np.array([[1, 2], [3, 4]])) + y = np.array([1, 2]) + + feature_transformer.fit(mock_feature_batch, y) + assert mock_tensor_transformer.fit_called + + mock_feature_batch_2 = mock_batch_class(mock_feature=np.array([[4, 5], [6, 7]])) + assert np.all(feature_transformer.transform(mock_feature_batch_2) == np.array([3, 4])) + + mock_tensor_transformer = MockTensorTransformer() + feature_transformer = FeatureTransformer(mock_feature_extraction_fn, + mock_tensor_transformer, + requires_fitting=False) + feature_transformer.fit(mock_feature_batch, y) + assert not mock_tensor_transformer.fit_called From 3b4e2fc6eba10c8d5af9597f26b51696964c6f9c 
Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Thu, 26 Sep 2019 19:56:02 +0200
Subject: [PATCH 10/21] style fixes

---
 takepod/examples/experiment_example.py |  5 ++---
 takepod/models/experiment.py           |  2 +-
 takepod/storage/example_factory.py     | 14 +++++++-------
 test/models/test_experiment.py         |  2 ++
 4 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/takepod/examples/experiment_example.py b/takepod/examples/experiment_example.py
index bdf1cc55..ee0ef6ec 100644
--- a/takepod/examples/experiment_example.py
+++ b/takepod/examples/experiment_example.py
@@ -15,6 +15,7 @@
 from sklearn.metrics import accuracy_score
 from sklearn.preprocessing import StandardScaler
 
+
 def numericalize_pauza_rating(rating):
     """Function numericalizes pauza_hr dataset rating field"""
     label = round(float(rating) * 2) - 1
@@ -71,9 +72,7 @@ def train_iterator_provider(dataset):
                        fields["Text"].vocab)
 
     feature_transform_fn = partial(feature_transform_mean_fun,
-                                   embedding_matrix=embedding_matrix)
-
-
+                                   embedding_matrix=embedding_matrix)
     feature_transformer = FeatureTransformer(feature_transform_fn, StandardScaler())
 
diff --git a/takepod/models/experiment.py b/takepod/models/experiment.py
index b7df1232..d33157ca 100644
--- a/takepod/models/experiment.py
+++ b/takepod/models/experiment.py
@@ -35,7 +35,7 @@ def __init__(self,
         model : class or model instance
             Class of the Model to be fitted or a pre-trained model.
             If pre-trained model is passed and `fit` is called a new model instance will
-            be created. For fine-tuning of the passed model instance call 
+            be created. For fine-tuning of the passed model instance call
             `partial_fit`.
             Must be a subclass of Podium's `AbstractSupervisedModel`
 
diff --git a/takepod/storage/example_factory.py b/takepod/storage/example_factory.py
index 148e3b2a..3efdc0bf 100644
--- a/takepod/storage/example_factory.py
+++ b/takepod/storage/example_factory.py
@@ -4,7 +4,7 @@
 import logging
 import json
 import csv
-from enum import Enum, auto
+from enum import Enum
 import xml.etree.ElementTree as ET
 
 from takepod.storage.field import unpack_fields
@@ -13,12 +13,12 @@
 
 class ExampleFormat(Enum):
-    LIST = lambda data, factory: factory.from_list(data)
-    DICT = lambda data, factory: factory.from_dict(data)
-    CSV = lambda data, factory: factory.from_csv(data)
-    NLTK = lambda data, factory: factory.from_fields_tree(data)
-    XML = lambda data, factory: factory.from_xml_str(data)
-    JSON = lambda data, factory: factory.from_json(data)
+    LIST = lambda data, factory: factory.from_list(data)  # noqa: E731
+    DICT = lambda data, factory: factory.from_dict(data)  # noqa: E731
+    CSV = lambda data, factory: factory.from_csv(data)  # noqa: E731
+    NLTK = lambda data, factory: factory.from_fields_tree(data)  # noqa: E731
+    XML = lambda data, factory: factory.from_xml_str(data)  # noqa: E731
+    JSON = lambda data, factory: factory.from_json(data)  # noqa: E731
 
 
 class Example:
diff --git a/test/models/test_experiment.py b/test/models/test_experiment.py
index 877ead7d..55da266d 100644
--- a/test/models/test_experiment.py
+++ b/test/models/test_experiment.py
@@ -36,9 +36,11 @@ def dataset():
     ds.finalize_fields()
     return ds
 
+
 def MockDataset():
     pass
 
+
 def mock_feature_transform_fun(x_batch):
     return x_batch.Score
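Another aside: PATCH 10 keeps the `ExampleFormat` members as plain lambdas and merely silences flake8's E731, so each member is directly a callable that dispatches to the matching `ExampleFactory` method. Below is a minimal sketch of that dispatch, assuming a hypothetical two-field setup; it is illustrative only and not part of any patch.

    from takepod.storage import ExampleFactory, ExampleFormat, Field

    # Hypothetical minimal fields; any dict of Fields works here.
    fields = {"Name": Field("Name"),
              "Score": Field("Score", tokenize=False)}
    factory = ExampleFactory(fields)

    raw = {"Name": "Marko", "Score": 30}

    # These two calls build the same Example. The pipeline's predict method
    # uses the second form internally, dispatching on the format member.
    example_a = factory.from_dict(raw)
    example_b = factory.from_format(raw, ExampleFormat.DICT)

From 2cf1cd87b8b029eab659facd29c8a3d2734d7885 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Thu, 26 Sep 2019 20:01:33 +0200
Subject: [PATCH 11/21] updated __init__

---
 takepod/__init__.py          | 3 ++-
 takepod/pipeline/__init__.py | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff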
--git a/takepod/__init__.py b/takepod/__init__.py index b7e65df9..f81a36d2 100644 --- a/takepod/__init__.py +++ b/takepod/__init__.py @@ -22,7 +22,8 @@ "models", "preproc", "storage", - "validation"] + "validation", + "pipeline"] # Reference for initialization of logging scikit-learn diff --git a/takepod/pipeline/__init__.py b/takepod/pipeline/__init__.py index e69de29b..d2381c3a 100644 --- a/takepod/pipeline/__init__.py +++ b/takepod/pipeline/__init__.py @@ -0,0 +1,3 @@ +from .pipeline import Pipeline + +__all__ = ["Pipeline"] From ad28d4c76de2b84b8d2375e8b04e2b2045d72846 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Fri, 27 Sep 2019 14:09:36 +0200 Subject: [PATCH 12/21] Implemented partial test for pipeline predict_raw --- takepod/pipeline/pipeline.py | 16 ++++-- test/pipeline/__init__.py | 0 test/pipeline/test_pipeline.py | 90 ++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 test/pipeline/__init__.py create mode 100644 test/pipeline/test_pipeline.py diff --git a/takepod/pipeline/pipeline.py b/takepod/pipeline/pipeline.py index c7216673..f813c59c 100644 --- a/takepod/pipeline/pipeline.py +++ b/takepod/pipeline/pipeline.py @@ -1,4 +1,4 @@ -from typing import Union, Dict, List, Callable, NamedTuple, Any +from typing import Union, Dict, List, Callable, NamedTuple, Any, Type import logging from takepod.storage import ExampleFactory, ExampleFormat @@ -18,7 +18,7 @@ def __init__(self, fields: Union[Dict, List], example_format: ExampleFormat, feature_transformer: FeatureTransformer, - model: AbstractSupervisedModel, + model: Union[AbstractSupervisedModel, Type[AbstractSupervisedModel]], trainer: AbstractTrainer = None, trainer_args: Dict = None, trainer_iterator_callable: Callable[[Dataset], Iterator] = None, @@ -43,8 +43,12 @@ def __init__(self, format into numpy arrays. Will be fitted along with the model to the provided data. - model: AbstractSupervisedModel - Model used to make predictions. + model : class or model instance + Class of the Model to be fitted or a pre-trained model. + If pre-trained model is passed and `fit` is called a new model instance will + be created. For fine-tuning of the passed model instance call + `partial_fit`. + Must be a subclass of Podium's `AbstractSupervisedModel` trainer: AbstractTrainer, Optional Trainer used to fit the model. 
If provided, this trainer instance will be @@ -95,6 +99,10 @@ def __init__(self, trainer=trainer, training_iterator_callable=trainer_iterator_callable, label_transform_fun=label_transform_fn) + + model_args = {} if model_args is None else model_args + trainer_args = {} if trainer_args is None else trainer_args + self.set_default_model_args(**model_args) self.set_default_trainer_args(**trainer_args) diff --git a/test/pipeline/__init__.py b/test/pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py new file mode 100644 index 00000000..69cf0bce --- /dev/null +++ b/test/pipeline/test_pipeline.py @@ -0,0 +1,90 @@ +import pytest +import numpy as np + +from takepod.pipeline import Pipeline +from takepod.storage import Field, Vocab, ExampleFormat +from takepod.models import AbstractSupervisedModel + + +@pytest.fixture +def fields(): + name_dict = { + "Marko": 1, + "Darko": 2, + "Ivana": 3 + } + + name_field = Field("Name", custom_numericalize=name_dict.get) + score_field = Field("Score", tokenize=False, custom_numericalize=int) + + name_field.finalize() + score_field.finalize() + + return {"Name": name_field, + "Score": score_field} + + +mock_data = [ + ["Marko", 50], + ["Darko", 60], + ["Ivana", 45] +] + + +class MockModel: + + def fit(self, *args, **kwargs): + pass + + def predict(self, x, **kwargs): + return {AbstractSupervisedModel.PREDICTION_KEY: x} + + +class MockTrainer: + def train(self, *args, **kwargs): + pass + + +class MockFeatureTransformer: + + def transform(self, x_batch): + return np.hstack((x_batch.Name, x_batch.Score)) + + +def test_pipeline_from_raw(fields): + + # Test for list format + fields_list = [fields['Name'], fields['Score']] + list_pipeline = Pipeline(fields_list, + ExampleFormat.LIST, + feature_transformer=MockFeatureTransformer(), + model=MockModel()) + + raw_list = ["Marko", 30] + expected_prediction = np.array([[1, 30]]) + prediction = list_pipeline.predict_raw(raw_list) + + assert np.all(expected_prediction == prediction) + + fields_dict = {field.name: field for field in fields_list} + dict_pipeline = Pipeline(fields_dict, + ExampleFormat.DICT, + feature_transformer=MockFeatureTransformer(), + model=MockModel()) + + # Test for Dict format + raw_dict = {'Name': "Marko", 'Score': 30} + expected_prediction = np.array([[1, 30]]) + prediction = dict_pipeline.predict_raw(raw_dict) + + assert np.all(expected_prediction == prediction) + + # Test for csv + raw_csv = "Marko,30" + csv_pipeline = Pipeline(fields_list, + ExampleFormat.CSV, + feature_transformer=MockFeatureTransformer(), + model=MockModel()) + expected_prediction = np.array([[1, 30]]) + prediction = csv_pipeline.predict_raw(raw_csv) + assert np.all(expected_prediction == prediction) From 20906aae066a08bbe24213e4095503cd2e6f3d86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Fri, 27 Sep 2019 14:10:17 +0200 Subject: [PATCH 13/21] Style correction --- test/pipeline/test_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index 69cf0bce..c9726b00 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -2,7 +2,7 @@ import numpy as np from takepod.pipeline import Pipeline -from takepod.storage import Field, Vocab, ExampleFormat +from takepod.storage import Field, ExampleFormat from takepod.models import AbstractSupervisedModel From 00383bb8487d1240cb810ee803d25aec80e29651 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Mon, 7 Oct 2019 14:27:22 +0200 Subject: [PATCH 14/21] Removed model_args and trainer_args from pipeline constructor --- takepod/pipeline/pipeline.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/takepod/pipeline/pipeline.py b/takepod/pipeline/pipeline.py index f813c59c..897eedfb 100644 --- a/takepod/pipeline/pipeline.py +++ b/takepod/pipeline/pipeline.py @@ -20,9 +20,7 @@ def __init__(self, feature_transformer: FeatureTransformer, model: Union[AbstractSupervisedModel, Type[AbstractSupervisedModel]], trainer: AbstractTrainer = None, - trainer_args: Dict = None, trainer_iterator_callable: Callable[[Dataset], Iterator] = None, - model_args: Dict = None, label_transform_fn: Callable[[NamedTuple], np.ndarray] = None ): """Creates a new pipeline instance. @@ -55,20 +53,10 @@ def __init__(self, stored in the pipeline and used as the default trainer if no trainer is provided in the `fit` and `partial_fit` methods. - trainer_args: Dict - Arguments passed as keyword arguments to the trainer during model training. - Arguments provided here are used as default arguments and can be updated by - passing extra arguments to the `fit` and `partial_fit` methods. - trainer_iterator_callable: Callable[[Dataset], Iterator] Callable used to instantiate new instances of the Iterator used in fitting the model. - model_args: Dict - Arguments passed as keyword arguments to the model during model instantiation. - Arguments provided here are used as default arguments and can be updated by - passing extra arguments to the `fit` method. - label_transform_fn: Callable[[NamedTuple], np.ndarray] Callable that transforms the target part of the batch returned by the iterator into the same format the model prediction is. For a hypothetical perfect model @@ -100,12 +88,6 @@ def __init__(self, training_iterator_callable=trainer_iterator_callable, label_transform_fun=label_transform_fn) - model_args = {} if model_args is None else model_args - trainer_args = {} if trainer_args is None else trainer_args - - self.set_default_model_args(**model_args) - self.set_default_trainer_args(**trainer_args) - def predict_raw(self, raw_example: Any, **kwargs) -> np.ndarray: From 48270a7232f9160eac12809b685c2a4e71f026e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Mon, 7 Oct 2019 14:48:58 +0200 Subject: [PATCH 15/21] Removed duplicated code in fit and partial_fit --- takepod/models/experiment.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/takepod/models/experiment.py b/takepod/models/experiment.py index d33157ca..15503fd3 100644 --- a/takepod/models/experiment.py +++ b/takepod/models/experiment.py @@ -160,14 +160,10 @@ def fit(self, """ model_kwargs = {} if model_kwargs is None else model_kwargs - trainer_kwargs = {} if trainer_kwargs is None else trainer_kwargs model_args = self.default_model_args.copy() model_args.update(model_kwargs) - trainer_args = self.default_trainer_args.copy() - trainer_args.update(trainer_kwargs) - trainer = trainer if trainer is not None else self.trainer if trainer is None: errmsg = "No trainer provided. 
Trainer must be provided either in the " \ @@ -187,17 +183,11 @@ def fit(self, # Create new model instance self.model = self.model_class(**model_args) - training_iterator_callable = training_iterator_callable \ - if training_iterator_callable is not None \ - else self.training_iterator_callable - - train_iterator = training_iterator_callable(dataset) # Train the model - trainer.train(self.model, - train_iterator, - self.feature_transformer, - self.label_transform_fun, - **trainer_args) + self.partial_fit(dataset, + trainer_kwargs, + trainer, + training_iterator_callable) def partial_fit(self, dataset: Dataset, From 6694d75e81fb3eb03d6e19b72a4e597f7e53607c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Mon, 7 Oct 2019 15:51:50 +0200 Subject: [PATCH 16/21] Added SklearnTensorTransformerWrapper --- takepod/examples/experiment_example.py | 5 +++-- takepod/models/__init__.py | 6 ++++-- takepod/models/transformers.py | 24 +++++++++++++++--------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/takepod/examples/experiment_example.py b/takepod/examples/experiment_example.py index ee0ef6ec..12627705 100644 --- a/takepod/examples/experiment_example.py +++ b/takepod/examples/experiment_example.py @@ -9,7 +9,7 @@ from takepod.datasets.impl.pauza_dataset import PauzaHRDataset from takepod.models.impl.fc_model import ScikitMLPClassifier from takepod.models.impl.simple_trainers import SimpleTrainer -from takepod.models import Experiment, FeatureTransformer +from takepod.models import Experiment, FeatureTransformer, SklearnTensorTransformerWrapper from takepod.validation import k_fold_classification_metrics from takepod.model_selection import grid_search from sklearn.metrics import accuracy_score @@ -74,7 +74,8 @@ def train_iterator_provider(dataset): feature_transform_fn = partial(feature_transform_mean_fun, embedding_matrix=embedding_matrix) - feature_transformer = FeatureTransformer(feature_transform_fn, StandardScaler()) + tensor_transformer = SklearnTensorTransformerWrapper(StandardScaler()) + feature_transformer = FeatureTransformer(feature_transform_fn, tensor_transformer) experiment = Experiment(ScikitMLPClassifier, trainer=trainer, diff --git a/takepod/models/__init__.py b/takepod/models/__init__.py index f11c685a..6d24792e 100644 --- a/takepod/models/__init__.py +++ b/takepod/models/__init__.py @@ -2,10 +2,12 @@ from .model import AbstractFrameworkModel, AbstractSupervisedModel from .batch_transform_functions import default_feature_transform, default_label_transform -from .transformers import FeatureTransformer, TensorTransformer +from .transformers import FeatureTransformer, TensorTransformer, \ + SklearnTensorTransformerWrapper from .experiment import Experiment from .trainer import AbstractTrainer __all__ = ["AbstractFrameworkModel", "AbstractSupervisedModel", "default_feature_transform", "default_label_transform", "Experiment", - "AbstractTrainer", "FeatureTransformer", "TensorTransformer"] + "AbstractTrainer", "FeatureTransformer", "TensorTransformer", + "SklearnTensorTransformerWrapper"] diff --git a/takepod/models/transformers.py b/takepod/models/transformers.py index 1c5c8084..53c586f0 100644 --- a/takepod/models/transformers.py +++ b/takepod/models/transformers.py @@ -8,6 +8,7 @@ class TensorTransformer(ABC): """Abstract class used to transform tensors. Used in feature pre-processing during training and prediction. 
""" + @abstractmethod def fit(self, x: np.ndarray, @@ -41,18 +42,25 @@ def transform(self, Transformed features.""" pass + def requires_fitting(self) -> bool: + return True + + +class SklearnTensorTransformerWrapper(TensorTransformer): -class DummyTensorTransformer(TensorTransformer): + def __init__(self, + feature_transformer): + self.feature_transformer = feature_transformer def fit(self, x: np.ndarray, y: np.ndarray): - pass + self.feature_transformer.fit(x, y) def transform(self, x: np.array ) -> np.ndarray: - return x + return self.feature_transformer.transform(x) # TODO add mechanism for Feature transformer to know if its tensor_transformer needs @@ -60,10 +68,10 @@ def transform(self, class FeatureTransformer: """Class used to transform podium batches into features used in model prediction and training.""" + def __init__(self, feature_extraction_fn: Callable[[NamedTuple], np.ndarray], - tensor_transformer: TensorTransformer = None, - requires_fitting=True): + tensor_transformer: TensorTransformer = None): """Creates a new FeatureTransformer. Parameters @@ -74,12 +82,9 @@ def __init__(self, tensor_transformer: TensorTransformer TensorTransformer used to transform the transform the tensors provided by the `feature_extraction_fn` callable. - requires_fitting: bool - Whether the provided TensorTransformer requires fitting. """ self.feature_extraction_fn = feature_extraction_fn self.tensor_transformer = tensor_transformer - self.requires_fitting_flag = requires_fitting def fit(self, x: NamedTuple, @@ -122,4 +127,5 @@ def transform(self, return self.tensor_transformer.transform(x_tensor) def requires_fitting(self): - return self.tensor_transformer is not None and self.requires_fitting_flag + return self.tensor_transformer is not None \ + and self.tensor_transformer.requires_fitting() From 2a17a47da5aaba14eb1de33ff699979e8d83c412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Mon, 7 Oct 2019 16:28:05 +0200 Subject: [PATCH 17/21] Added Documentation, fixed tests --- takepod/models/transformers.py | 43 +++++++++++++++++++++++++++--- takepod/storage/example_factory.py | 4 +-- test/models/test_experiment.py | 9 ++++--- test/models/test_transformers.py | 19 ++++++++----- test/pipeline/test_pipeline.py | 7 +++-- 5 files changed, 62 insertions(+), 20 deletions(-) diff --git a/takepod/models/transformers.py b/takepod/models/transformers.py index 53c586f0..9580f761 100644 --- a/takepod/models/transformers.py +++ b/takepod/models/transformers.py @@ -42,15 +42,36 @@ def transform(self, Transformed features.""" pass + @abstractmethod def requires_fitting(self) -> bool: - return True + """Returns True if this TensorTransformer requires fitting. + + Returns + ------- + True if this TensorTransformer requires fitting, else returns False. + """ + pass class SklearnTensorTransformerWrapper(TensorTransformer): + """Wrapper class for Sklearn feature transformers.""" def __init__(self, - feature_transformer): + feature_transformer, + requires_fitting=True): + """Creates a new SklearnTensorTransformerWrapper. + + Parameters + ---------- + feature_transformer + The sklearn feature transformer to be wrapped. Example of this would be + a sklean pipeline containing a sequence of transformations. + + requires_fitting: bool + Whether this tensor transformer should be fitted. 
+        """
         self.feature_transformer = feature_transformer
+        self.requires_fitting_flag = requires_fitting
 
     def fit(self,
             x: np.ndarray,
             y: np.ndarray):
@@ -62,6 +83,9 @@ def transform(self,
                   ) -> np.ndarray:
         return self.feature_transformer.transform(x)
 
+    def requires_fitting(self) -> bool:
+        return self.requires_fitting_flag
+
 
 # TODO add mechanism for Feature transformer to know if its tensor_transformer needs
 # fitting so batching can be avoided by callers.
@@ -78,7 +102,8 @@ def __init__(self,
         ----------
         feature_extraction_fn: Callable[[NamedTuple], np.ndarray]
             Callable that takes a podium feature batch as an argument and returns a
-            numpy tensor representing the data.
+            numpy array representing the data.
+
         tensor_transformer: TensorTransformer
             TensorTransformer used to transform the tensors provided by the
             `feature_extraction_fn` callable.
@@ -127,5 +152,15 @@ def transform(self,
         return self.tensor_transformer.transform(x_tensor)
 
     def requires_fitting(self):
+        """Returns True if the contained TensorTransformer exists and requires fitting,
+        else returns False.
+
+        Returns
+        -------
+        True if the contained TensorTransformer exists and requires fitting,
+        else returns False.
+        """
         return self.tensor_transformer is not None \
-            and self.tensor_transformer.requires_fitting()
+               and self.tensor_transformer.requires_fitting()
diff --git a/takepod/storage/example_factory.py b/takepod/storage/example_factory.py
index 3efdc0bf..768619c8 100644
--- a/takepod/storage/example_factory.py
+++ b/takepod/storage/example_factory.py
@@ -263,8 +263,8 @@ def from_fields_tree(self, data, subtrees=False):
 
     def from_format(self,
                     data,
-                    format: ExampleFormat):
-        return format(data, self)
+                    format_tag: ExampleFormat):
+        return format_tag(data, self)
 
 
 def tree_to_list(tree):
diff --git a/test/models/test_experiment.py b/test/models/test_experiment.py
index 55da266d..e6d7e6ae 100644
--- a/test/models/test_experiment.py
+++ b/test/models/test_experiment.py
@@ -7,8 +7,7 @@
 from takepod.storage import Field, ExampleFactory, Vocab
 
 
-@pytest.fixture
-def dataset():
+def get_dataset():
     data = [{"Name": "Mark Dark",
              "Score": 5},
             {"Name": "Stephen Smith",
@@ -37,7 +36,7 @@
     return ds
 
 
-def MockDataset():
+class MockDataset:
     pass
 
 
@@ -67,7 +66,9 @@
 
 @pytest.mark.parametrize("fit_transformer", (False, True))
-def test_experiment_train(dataset, fit_transformer):
+def test_experiment_train(fit_transformer):
+    dataset = get_dataset()
+
     default_model_args = {
         'm_arg1': 1,
         'm_arg2': 2
diff --git a/test/models/test_transformers.py b/test/models/test_transformers.py
index d7bb7082..ab876ecf 100644
--- a/test/models/test_transformers.py
+++ b/test/models/test_transformers.py
@@ -29,10 +29,17 @@ def transform(self,
             assert np.all(x == np.array([[4, 5], [6, 7]]))
             return np.array([3, 4])
 
+        def requires_fitting(self) -> bool:
+            return True
+
+    class MockTensorTransformerNoFitting(MockTensorTransformer):
+
+        def requires_fitting(self):
+            return False
+
     mock_tensor_transformer = MockTensorTransformer()
     feature_transformer = FeatureTransformer(mock_feature_extraction_fn,
-                                             mock_tensor_transformer,
-                                             requires_fitting=True)
+                                             mock_tensor_transformer)
 
     mock_feature_batch = mock_batch_class(mock_feature=np.array([[1, 2], [3, 4]]))
     y = np.array([1, 2])
 
     feature_transformer.fit(mock_feature_batch, y)
@@ -43,9 +50,9 @@
     assert np.all(feature_transformer.transform(mock_feature_batch_2) == np.array([3, 4]))
 
-    mock_tensor_transformer = MockTensorTransformer()
+    
mock_tensor_transformer_no_fit = MockTensorTransformerNoFitting() feature_transformer = FeatureTransformer(mock_feature_extraction_fn, - mock_tensor_transformer, - requires_fitting=False) + mock_tensor_transformer_no_fit) feature_transformer.fit(mock_feature_batch, y) - assert not mock_tensor_transformer.fit_called + assert not mock_tensor_transformer_no_fit.fit_called + assert np.all(feature_transformer.transform(mock_feature_batch_2) == np.array([3, 4])) diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index c9726b00..0bdcf57a 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -1,4 +1,3 @@ -import pytest import numpy as np from takepod.pipeline import Pipeline @@ -6,8 +5,7 @@ from takepod.models import AbstractSupervisedModel -@pytest.fixture -def fields(): +def get_fields(): name_dict = { "Marko": 1, "Darko": 2, @@ -51,7 +49,8 @@ def transform(self, x_batch): return np.hstack((x_batch.Name, x_batch.Score)) -def test_pipeline_from_raw(fields): +def test_pipeline_from_raw(): + fields = get_fields() # Test for list format fields_list = [fields['Name'], fields['Score']] From 35f2831a302b96dc7bc1bdceb5a078af947b2216 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 9 Oct 2019 18:30:27 +0200 Subject: [PATCH 18/21] Added test for sklearn wrapper --- takepod/models/transformers.py | 5 ++-- test/models/test_transformers.py | 48 +++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/takepod/models/transformers.py b/takepod/models/transformers.py index 9580f761..8b40311e 100644 --- a/takepod/models/transformers.py +++ b/takepod/models/transformers.py @@ -76,7 +76,8 @@ def __init__(self, def fit(self, x: np.ndarray, y: np.ndarray): - self.feature_transformer.fit(x, y) + if self.requires_fitting(): + self.feature_transformer.fit(x, y) def transform(self, x: np.array @@ -87,8 +88,6 @@ def requires_fitting(self) -> bool: return self.requires_fitting_flag -# TODO add mechanism for Feature transformer to know if its tensor_transformer needs -# fitting so batching can be avoided by callers. 
class FeatureTransformer:
     """Class used to transform podium batches into features used in model prediction and
     training."""
diff --git a/test/models/test_transformers.py b/test/models/test_transformers.py
index ab876ecf..9ad5e0ef 100644
--- a/test/models/test_transformers.py
+++ b/test/models/test_transformers.py
@@ -1,8 +1,9 @@
 from collections import namedtuple
 
 import numpy as np
+from sklearn.preprocessing import StandardScaler
 
-from takepod.models import FeatureTransformer, TensorTransformer
+from takepod.models import FeatureTransformer, TensorTransformer, SklearnTensorTransformerWrapper
 
 
 def test_feature_transformer():
@@ -56,3 +57,48 @@
     feature_transformer.fit(mock_feature_batch, y)
     assert not mock_tensor_transformer_no_fit.fit_called
     assert np.all(feature_transformer.transform(mock_feature_batch_2) == np.array([3, 4]))
+
+
+def test_sklearn_feature_transformer_wrapper(mocker):
+    class MockSklearnTransformer:
+
+        def fit(self, x, y):
+            pass
+
+        def transform(self, x):
+            return x + 1
+
+    # test with fitting
+    tensor_transformer = MockSklearnTransformer()
+    mocker.spy(tensor_transformer, 'fit')
+    mocker.spy(tensor_transformer, 'transform')
+
+    mock_feature_batch = np.array([[1, 2, 3]])
+    mock_label_batch = np.array([[2, 3, 4]])
+
+    wrapper = SklearnTensorTransformerWrapper(tensor_transformer, requires_fitting=True)
+
+    assert wrapper.requires_fitting()
+
+    wrapper.fit(mock_feature_batch, mock_label_batch)
+    tensor_transformer.fit.assert_called_once_with(mock_feature_batch, mock_label_batch)
+
+    result = wrapper.transform(mock_feature_batch)
+    tensor_transformer.transform.assert_called_once_with(mock_feature_batch)
+    assert np.all(result == mock_feature_batch + 1)
+
+    # test without fitting
+    tensor_transformer = MockSklearnTransformer()
+    mocker.spy(tensor_transformer, 'fit')
+    mocker.spy(tensor_transformer, 'transform')
+
+    wrapper = SklearnTensorTransformerWrapper(tensor_transformer, requires_fitting=False)
+
+    assert not wrapper.requires_fitting()
+
+    wrapper.fit(mock_feature_batch, mock_label_batch)
+    tensor_transformer.fit.assert_not_called()
+
+    result = wrapper.transform(mock_feature_batch)
+    tensor_transformer.transform.assert_called_once_with(mock_feature_batch)
+    assert np.all(result == mock_feature_batch + 1)
\ No newline at end of file

From 43c706079068b632b52071bd030598b7309fc04c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ivan=20Smokovi=C4=87?=
Date: Wed, 9 Oct 2019 18:32:12 +0200
Subject: [PATCH 19/21] Added test for sklearn wrapper

---
 test/models/test_transformers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/models/test_transformers.py b/test/models/test_transformers.py
index 9ad5e0ef..33efa300 100644
--- a/test/models/test_transformers.py
+++ b/test/models/test_transformers.py
@@ -1,9 +1,9 @@
 from collections import namedtuple
 
 import numpy as np
-from sklearn.preprocessing import StandardScaler
 
-from takepod.models import FeatureTransformer, TensorTransformer, SklearnTensorTransformerWrapper
+from takepod.models import FeatureTransformer, TensorTransformer, \
+    SklearnTensorTransformerWrapper
 
 
 def test_feature_transformer():
@@ -101,4 +101,4 @@
     result = wrapper.transform(mock_feature_batch)
     tensor_transformer.transform.assert_called_once_with(mock_feature_batch)
-    assert np.all(result == mock_feature_batch + 1)
\ No newline at end of file
+    assert np.all(result == mock_feature_batch + 1)
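One more aside before the final test cleanups: a minimal usage sketch of `SklearnTensorTransformerWrapper` as introduced in PATCH 16 and tested above. It is illustrative only and not part of any patch; the `Batch` namedtuple is a hypothetical stand-in for a podium feature batch.

    from collections import namedtuple

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    from takepod.models import FeatureTransformer, SklearnTensorTransformerWrapper

    Batch = namedtuple("Batch", ("Text",))

    # Wrap the sklearn transformer; pass requires_fitting=False instead when the
    # wrapped transformer is stateless or has already been fitted elsewhere.
    tensor_transformer = SklearnTensorTransformerWrapper(StandardScaler())
    feature_transformer = FeatureTransformer(lambda batch: batch.Text,
                                             tensor_transformer)

    batch = Batch(Text=np.array([[1.0, 2.0], [3.0, 4.0]]))
    feature_transformer.fit(batch, y=np.array([0, 1]))  # fits the StandardScaler
    scaled = feature_transformer.transform(batch)       # standardized columns

From d13ddc0d610efc9cdda3ee1f69fb7cc4d4865f36 Mon Sep 17 00:00:00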
2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 9 Oct 2019 19:33:34 +0200 Subject: [PATCH 20/21] Improved tests --- test/models/test_transformers.py | 66 ++++++++++++++++---------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/test/models/test_transformers.py b/test/models/test_transformers.py index 33efa300..e5d511be 100644 --- a/test/models/test_transformers.py +++ b/test/models/test_transformers.py @@ -6,57 +6,59 @@ SklearnTensorTransformerWrapper -def test_feature_transformer(): - mock_batch_class = namedtuple("Mock_feature_batch", ("mock_feature",)) +class MockTensorTransformer(TensorTransformer): - def mock_feature_extraction_fn(x): - return x.mock_feature + def __init__(self, requires_fitting): + self.requires_fitting_flag = requires_fitting + + def fit(self, + x: np.ndarray, + y: np.ndarray): + pass - class MockTensorTransformer(TensorTransformer): + def transform(self, + x: np.array + ) -> np.ndarray: + return [3, 4] - def __init__(self): - self.fit_called = False + def requires_fitting(self) -> bool: + return self.requires_fitting_flag - def fit(self, - x: np.ndarray, - y: np.ndarray): - self.fit_called = True - assert np.all(x == np.array([[1, 2], [3, 4]])) - assert np.all(y == np.array([1, 2])) - def transform(self, - x: np.array - ) -> np.ndarray: - assert np.all(x == np.array([[4, 5], [6, 7]])) - return np.array([3, 4]) +def test_feature_transformer(mocker): + mock_batch_class = namedtuple("Mock_feature_batch", ("mock_feature",)) - def requires_fitting(self) -> bool: - return True + def mock_feature_extraction_fn(x): + return x.mock_feature - class MockTensorTransformerNoFitting(MockTensorTransformer): + mock_tensor_transformer = MockTensorTransformer(requires_fitting=True) - def requires_fitting(self): - return False + mocker.spy(mock_tensor_transformer, 'fit') + mocker.spy(mock_tensor_transformer, 'transform') - mock_tensor_transformer = MockTensorTransformer() feature_transformer = FeatureTransformer(mock_feature_extraction_fn, mock_tensor_transformer) - mock_feature_batch = mock_batch_class(mock_feature=np.array([[1, 2], [3, 4]])) - y = np.array([1, 2]) + mock_feature_batch = mock_batch_class(mock_feature=[1, 2]) + y = [3, 4] feature_transformer.fit(mock_feature_batch, y) - assert mock_tensor_transformer.fit_called + mock_tensor_transformer.fit.assert_called_once_with([1, 2], [3, 4]) - mock_feature_batch_2 = mock_batch_class(mock_feature=np.array([[4, 5], [6, 7]])) - assert np.all(feature_transformer.transform(mock_feature_batch_2) == np.array([3, 4])) + mock_feature_batch_2 = mock_batch_class(mock_feature=[4, 5]) + assert np.all(feature_transformer.transform(mock_feature_batch_2) == [3, 4]) + mock_tensor_transformer.transform.assert_called_once_with([4, 5]) - mock_tensor_transformer_no_fit = MockTensorTransformerNoFitting() + mock_tensor_transformer_no_fit = MockTensorTransformer(requires_fitting=False) + mocker.spy(mock_tensor_transformer_no_fit, 'fit') + mocker.spy(mock_tensor_transformer_no_fit, 'transform') feature_transformer = FeatureTransformer(mock_feature_extraction_fn, mock_tensor_transformer_no_fit) + feature_transformer.fit(mock_feature_batch, y) - assert not mock_tensor_transformer_no_fit.fit_called - assert np.all(feature_transformer.transform(mock_feature_batch_2) == np.array([3, 4])) + mock_tensor_transformer_no_fit.fit.assert_not_called() + assert np.all(feature_transformer.transform(mock_feature_batch_2) == [3, 4]) + mock_tensor_transformer_no_fit.transform.assert_called_once_with([4, 5]) def 
test_sklearn_feature_transformer_wrapper(mocker): From 34da938c3b702801aa56466db95b6e0cee9101b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Smokovi=C4=87?= Date: Wed, 9 Oct 2019 20:06:39 +0200 Subject: [PATCH 21/21] addressed minor PR complaints --- takepod/models/transformers.py | 7 ++++--- test/models/test_experiment.py | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/takepod/models/transformers.py b/takepod/models/transformers.py index 8b40311e..6cfe18c3 100644 --- a/takepod/models/transformers.py +++ b/takepod/models/transformers.py @@ -6,7 +6,8 @@ class TensorTransformer(ABC): """Abstract class used to transform tensors. Used in feature pre-processing during - training and prediction. + training and prediction. Usually used in FeatureTransformer to transform tensors + returned by the feature extraction callable. """ @abstractmethod @@ -27,7 +28,7 @@ def fit(self, @abstractmethod def transform(self, - x: np.array + x: np.ndarray ) -> np.ndarray: """Transforms the passed features. @@ -89,7 +90,7 @@ def requires_fitting(self) -> bool: class FeatureTransformer: - """Class used to transform podium batches into features used in model prediction and + """Class used to transform Dataset batches into features used in model prediction and training.""" def __init__(self, diff --git a/test/models/test_experiment.py b/test/models/test_experiment.py index e6d7e6ae..d50ca46c 100644 --- a/test/models/test_experiment.py +++ b/test/models/test_experiment.py @@ -3,7 +3,7 @@ import pytest import numpy as np from takepod.models import AbstractSupervisedModel, Experiment -from takepod.datasets import Dataset, Iterator +from takepod.datasets import Dataset from takepod.storage import Field, ExampleFactory, Vocab @@ -65,6 +65,10 @@ def requires_fitting(self): return self.to_fit +class MockIterator: + pass + + @pytest.mark.parametrize("fit_transformer", (False, True)) def test_experiment_train(fit_transformer): dataset = get_dataset() @@ -103,7 +107,7 @@ def test_experiment_train(fit_transformer): mock_transformer = MockTransformer(fit_transformer) - my_iterator = Iterator(dataset) + my_iterator = MockIterator() class MockModel: def __init__(self, **kwargs):