diff --git a/takepod/__init__.py b/takepod/__init__.py
index b7e65df9..f81a36d2 100644
--- a/takepod/__init__.py
+++ b/takepod/__init__.py
@@ -22,7 +22,8 @@
            "models",
            "preproc",
            "storage",
-           "validation"]
+           "validation",
+           "pipeline"]
 
 # Reference for initialization of logging scikit-learn
diff --git a/takepod/examples/experiment_example.py b/takepod/examples/experiment_example.py
index 0111927f..12627705 100644
--- a/takepod/examples/experiment_example.py
+++ b/takepod/examples/experiment_example.py
@@ -9,10 +9,11 @@
 from takepod.datasets.impl.pauza_dataset import PauzaHRDataset
 from takepod.models.impl.fc_model import ScikitMLPClassifier
 from takepod.models.impl.simple_trainers import SimpleTrainer
-from takepod.models import Experiment
+from takepod.models import Experiment, FeatureTransformer, SklearnTensorTransformerWrapper
 from takepod.validation import k_fold_classification_metrics
 from takepod.model_selection import grid_search
 from sklearn.metrics import accuracy_score
+from sklearn.preprocessing import StandardScaler
 
 
 def numericalize_pauza_rating(rating):
@@ -70,15 +71,17 @@ def train_iterator_provider(dataset):
     embedding_matrix = vectorizer.get_embedding_matrix(
         fields["Text"].vocab)
 
-    feature_transform = partial(feature_transform_mean_fun,
-                                embedding_matrix=embedding_matrix)
+    feature_transform_fn = partial(feature_transform_mean_fun,
+                                   embedding_matrix=embedding_matrix)
+
+    tensor_transformer = SklearnTensorTransformerWrapper(StandardScaler())
+    feature_transformer = FeatureTransformer(feature_transform_fn, tensor_transformer)
 
     experiment = Experiment(ScikitMLPClassifier,
-                            trainer,
-                            train_iterator_provider,
-                            None,
-                            feature_transform,
-                            label_transform_fun)
+                            trainer=trainer,
+                            training_iterator_callable=train_iterator_provider,
+                            feature_transformer=feature_transformer,
+                            label_transform_fun=label_transform_fun)
 
     _, model_params, train_params = \
         grid_search(experiment,
diff --git a/takepod/models/__init__.py b/takepod/models/__init__.py
index e367f31f..6d24792e 100644
--- a/takepod/models/__init__.py
+++ b/takepod/models/__init__.py
@@ -2,9 +2,12 @@
 
 from .model import AbstractFrameworkModel, AbstractSupervisedModel
 from .batch_transform_functions import default_feature_transform, default_label_transform
+from .transformers import FeatureTransformer, TensorTransformer, \
+    SklearnTensorTransformerWrapper
 from .experiment import Experiment
 from .trainer import AbstractTrainer
 
 __all__ = ["AbstractFrameworkModel", "AbstractSupervisedModel",
            "default_feature_transform", "default_label_transform", "Experiment",
-           "AbstractTrainer"]
+           "AbstractTrainer", "FeatureTransformer", "TensorTransformer",
+           "SklearnTensorTransformerWrapper"]
diff --git a/takepod/models/experiment.py b/takepod/models/experiment.py
index 560e5983..15503fd3 100644
--- a/takepod/models/experiment.py
+++ b/takepod/models/experiment.py
@@ -1,26 +1,29 @@
-"""Modules defines an experiment - class used to combine iteration over data,
+"""Module defines an experiment - class used to combine iteration over data,
 model training and prediction."""
-from typing import Callable, NamedTuple, Dict, Type
+from typing import Callable, NamedTuple, Dict, Type, Union
+from inspect import isclass
+import logging
 
 import numpy as np
 
 from takepod.datasets.dataset import Dataset
 from takepod.datasets.iterator import Iterator, SingleBatchIterator
-from takepod.models import AbstractSupervisedModel,\
-    default_feature_transform, default_label_transform
+from takepod.models import AbstractSupervisedModel, \
+    default_feature_transform, default_label_transform, FeatureTransformer
 from takepod.models.trainer import AbstractTrainer
 
+_LOGGER = logging.getLogger(__name__)
+
 
 class Experiment:
     """Class used to streamline model fitting and prediction."""
 
     def __init__(self,
-                 model_class: Type[AbstractSupervisedModel],
-                 trainer: AbstractTrainer,
-                 training_iterator_callable: Callable[[Dataset], Iterator],
+                 model: Union[Type[AbstractSupervisedModel], AbstractSupervisedModel],
+                 feature_transformer: FeatureTransformer = None,
+                 trainer: AbstractTrainer = None,
+                 training_iterator_callable: Callable[[Dataset], Iterator] = None,
                  prediction_iterator_callable: Callable[[Dataset], Iterator] = None,
-                 feature_transform_fun:
-                     Callable[[NamedTuple], np.ndarray] = None,
                  label_transform_fun:
                      Callable[[NamedTuple], np.ndarray] = None
                  ):
@@ -29,8 +32,11 @@ def __init__(self,
 
         Parameters
         ----------
-        model_class : class
-            Class of the Model to be fitted.
+        model : class or model instance
+            Class of the Model to be fitted or a pre-trained model.
+            If a pre-trained model is passed and `fit` is called, a new model
+            instance will be created. For fine-tuning of the passed model instance
+            call `partial_fit`.
             Must be a subclass of Podium's `AbstractSupervisedModel`
 
         trainer : AbstractTrainer
@@ -46,9 +52,10 @@ def __init__(self,
             a single tensor before being returned.
             If passed None, a SingleBatchIterator will be used as a default.
 
-        feature_transform_fun : Callable[[NamedTuple], np.ndarray]
-            Callable that transforms the input part of the batch returned by the iterator
-            into features that can be fed into the model.
+        feature_transformer : FeatureTransformer
+            FeatureTransformer that transforms the input part of the batch returned
+            by the iterator into features that can be fed into the model. Will also
+            be fitted during Experiment fitting.
 
         label_transform_fun : Callable[[NamedTuple], np.ndarray]
             Callable that transforms the target part of the batch returned by the iterator
@@ -56,8 +63,13 @@ def __init__(self,
             the prediction result of the model for some examples must be identical to the
             result of this callable for those same examples.
         """
-        self.model_class = model_class
-        self.model = None
+        if isclass(model):
+            self.model_class = model
+            self.model = None
+        else:
+            self.model_class = model.__class__
+            self.model = model
+
         self.trainer = trainer
         self.training_iterator_callable = training_iterator_callable
 
@@ -72,9 +84,9 @@ def default_prediction_iterator_callable(dataset):
         else:
             self.prediction_iterator_callable = prediction_iterator_callable
 
-        self.feature_transform_fun = feature_transform_fun \
-            if feature_transform_fun is not None \
-            else default_feature_transform
+        self.feature_transformer = feature_transformer \
+            if feature_transformer is not None \
+            else FeatureTransformer(default_feature_transform)
 
         self.label_transform_fun = label_transform_fun \
             if label_transform_fun is not None \
@@ -107,7 +119,10 @@ def set_default_trainer_args(self, **kwargs):
     def fit(self,
             dataset: Dataset,
             model_kwargs: Dict = None,
-            trainer_kwargs: Dict = None
+            trainer_kwargs: Dict = None,
+            feature_transformer: FeatureTransformer = None,
+            trainer: AbstractTrainer = None,
+            training_iterator_callable: Callable[[Dataset], Iterator] = None,
             ):
         """Fits the model to the provided Dataset. During fitting, the provided
         Iterator and Trainer are used.
@@ -126,25 +141,105 @@ def fit(self,
             Dict containing trainer arguments. Arguments passed to the trainer are the
             default arguments defined with `set_default_trainer_args`
             updated/overridden by 'trainer_kwargs'.
+
+        feature_transformer : FeatureTransformer, Optional
+            FeatureTransformer that transforms the input part of the batch returned
+            by the iterator into features that can be fed into the model. Will also
+            be fitted during Experiment fitting.
+            If None, the default FeatureTransformer provided in the constructor will
+            be used. Otherwise, this will overwrite the default feature transformer.
+
+        trainer : AbstractTrainer, Optional
+            Trainer used to fit the model. If None, the trainer provided in the
+            constructor will be used.
+
+        training_iterator_callable : Callable[[Dataset], Iterator], Optional
+            Callable used to instantiate new instances of the Iterator used in
+            fitting the model. If None, the training_iterator_callable provided in
+            the constructor will be used.
         """
+
         model_kwargs = {} if model_kwargs is None else model_kwargs
-        trainer_kwargs = {} if trainer_kwargs is None else trainer_kwargs
 
         model_args = self.default_model_args.copy()
         model_args.update(model_kwargs)
 
-        trainer_args = self.default_trainer_args.copy()
-        trainer_args.update(trainer_kwargs)
+        trainer = trainer if trainer is not None else self.trainer
+        if trainer is None:
+            errmsg = "No trainer provided. Trainer must be provided either in the " \
+                     "constructor or as an argument to the fit method."
+            _LOGGER.error(errmsg)
+            raise RuntimeError(errmsg)
+
+        if feature_transformer is not None:
+            self.feature_transformer = feature_transformer
+
+        # Fit the feature transformer if it needs fitting
+        if self.feature_transformer.requires_fitting():
+            x_batch, y_batch = next(SingleBatchIterator(dataset).__iter__())
+            y = self.label_transform_fun(y_batch)
+            self.feature_transformer.fit(x_batch, y)
 
         # Create new model instance
         self.model = self.model_class(**model_args)
 
-        train_iterator = self.training_iterator_callable(dataset)
-
-        self.trainer.train(self.model,
-                           train_iterator,
-                           self.feature_transform_fun,
-                           self.label_transform_fun,
-                           **trainer_args)
+        # Train the model
+        self.partial_fit(dataset,
+                         trainer_kwargs,
+                         trainer,
+                         training_iterator_callable)
+
+    def partial_fit(self,
+                    dataset: Dataset,
+                    trainer_kwargs: Dict = None,
+                    trainer: AbstractTrainer = None,
+                    training_iterator_callable: Callable[[Dataset], Iterator] = None):
+        """Fits the model to the data without resetting the model.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            Dataset to fit the model to.
+
+        trainer_kwargs : dict
+            Dict containing trainer arguments. Arguments passed to the trainer are
+            the default arguments defined with `set_default_trainer_args`
+            updated/overridden by 'trainer_kwargs'.
+
+        trainer : AbstractTrainer, Optional
+            Trainer used to fit the model. If None, the trainer provided in the
+            constructor will be used.
+
+        training_iterator_callable : Callable[[Dataset], Iterator], Optional
+            Callable used to instantiate new instances of the Iterator used in
+            fitting the model. If None, the training_iterator_callable provided in
+            the constructor will be used.
+        """
+        self._check_if_model_exists()
+
+        trainer = trainer if trainer is not None else self.trainer
+        if trainer is None:
+            errmsg = "No trainer provided. Trainer must be provided either in the " \
+                     "constructor or as an argument to the partial_fit method."
+            _LOGGER.error(errmsg)
+            raise RuntimeError(errmsg)
+
+        trainer_kwargs = {} if trainer_kwargs is None else trainer_kwargs
+        trainer_args = self.default_trainer_args.copy()
+        trainer_args.update(trainer_kwargs)
+
+        training_iterator_callable = training_iterator_callable \
+            if training_iterator_callable is not None \
+            else self.training_iterator_callable
+
+        train_iterator = training_iterator_callable(dataset)
+
+        trainer.train(self.model,
+                      train_iterator,
+                      self.feature_transformer,
+                      self.label_transform_fun,
+                      **trainer_args)
 
     def predict(self,
                 dataset: Dataset,
@@ -167,13 +265,21 @@ def predict(self,
         """
         # TODO: new method of providing examples must be defined.
         # examples is taken in dataset form as proof-of-concept.
+        self._check_if_model_exists()
+
         y = []
 
         for x_batch, _ in self.prediction_iterator_callable(dataset):
-            x_batch_tensor = self.feature_transform_fun(x_batch)
+            x_batch_tensor = self.feature_transformer.transform(x_batch)
             batch_prediction = self.model.predict(x_batch_tensor, **kwargs)
             prediction_tensor = batch_prediction[AbstractSupervisedModel.PREDICTION_KEY]
             y.append(prediction_tensor)
 
         return np.concatenate(y)
+
+    def _check_if_model_exists(self):
+        if self.model is None:
+            errmsg = "Model instance not available. Please provide a model instance " \
+                     "in the constructor or call `fit` before calling `partial_fit`."
+            _LOGGER.error(errmsg)
+            raise RuntimeError(errmsg)
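
Note: the `fit`/`partial_fit` split above supports both training from scratch and fine-tuning a passed-in model instance. A minimal usage sketch of the new keyword-argument API; `MyModel`, `my_trainer`, `iterator_fn` and the dataset names are hypothetical placeholders, not part of this diff:

    # Passing a model class: `fit` creates a fresh instance and trains it.
    experiment = Experiment(MyModel,
                            trainer=my_trainer,
                            training_iterator_callable=iterator_fn,
                            feature_transformer=feature_transformer,
                            label_transform_fun=label_fn)
    experiment.fit(train_dataset)

    # Continue training the same model instance without resetting it.
    experiment.partial_fit(extra_dataset)

    # Passing a pre-trained instance: `partial_fit` fine-tunes it directly,
    # while calling `fit` would replace it with a new instance.
    experiment = Experiment(pretrained_model,
                            trainer=my_trainer,
                            training_iterator_callable=iterator_fn)
    experiment.partial_fit(new_dataset)
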
diff --git a/takepod/models/impl/simple_trainers.py b/takepod/models/impl/simple_trainers.py
index 2de8617d..947e3d4d 100644
--- a/takepod/models/impl/simple_trainers.py
+++ b/takepod/models/impl/simple_trainers.py
@@ -16,13 +16,13 @@ class SimpleTrainer(AbstractTrainer):
     def train(self,
               model,
               iterator,
-              feature_transform_fun,
+              feature_transformer,
               label_transform_fun,
               **kwargs):
         self._check_kwargs(**kwargs)
         for _ in range(kwargs[SimpleTrainer.MAX_EPOCH_KEY]):
             for x_batch, y_batch in iterator:
-                x = feature_transform_fun(x_batch)
+                x = feature_transformer.transform(x_batch)
                 y = label_transform_fun(y_batch)
                 model.fit(X=x, y=y)
diff --git a/takepod/models/trainer.py b/takepod/models/trainer.py
index e1c71cdc..5c8ec39a 100644
--- a/takepod/models/trainer.py
+++ b/takepod/models/trainer.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 
-from takepod.models import AbstractSupervisedModel
+from takepod.models import AbstractSupervisedModel, FeatureTransformer
 from takepod.datasets import Iterator
 
 
@@ -15,8 +15,7 @@ class AbstractTrainer(ABC):
     def train(self,
               model: AbstractSupervisedModel,
               iterator: Iterator,
-              feature_transform_fun:
-                  Callable[[NamedTuple], np.ndarray],
+              feature_transformer: FeatureTransformer,
               label_transform_fun:
                   Callable[[NamedTuple], np.ndarray],
               **kwargs):
@@ -27,7 +26,7 @@ def train(self,
             The model that needs to be trained.
         iterator : Iterator
             Iterator instance that provides data from a dataset
-        feature_transform_fun: Callable[[NamedTuple], np.ndarray]
+        feature_transformer : FeatureTransformer
             Callable that transforms the input part of the batch returned by the iterator
             into features that can be fed into the model.
         label_transform_fun: Callable[[NamedTuple], np.ndarray]
diff --git a/takepod/models/transformers.py b/takepod/models/transformers.py
new file mode 100644
index 00000000..6cfe18c3
--- /dev/null
+++ b/takepod/models/transformers.py
@@ -0,0 +1,166 @@
+from abc import ABC, abstractmethod
+from typing import Callable, NamedTuple
+
+import numpy as np
+
+
+class TensorTransformer(ABC):
+    """Abstract class used to transform tensors. Used in feature pre-processing during
+    training and prediction. Usually used in a FeatureTransformer to transform tensors
+    returned by the feature extraction callable.
+    """
+
+    @abstractmethod
+    def fit(self,
+            x: np.ndarray,
+            y: np.ndarray):
+        """Fits the transformer to the provided data.
+
+        Parameters
+        ----------
+        x : np.ndarray
+            Features in numpy array form.
+        y : np.ndarray
+            Labels in numpy array form.
+        """
+        pass
+
+    @abstractmethod
+    def transform(self,
+                  x: np.ndarray
+                  ) -> np.ndarray:
+        """Transforms the passed features.
+
+        Parameters
+        ----------
+        x : np.ndarray
+            Features to be transformed in numpy array form.
+
+        Returns
+        -------
+        np.ndarray
+            Transformed features."""
+        pass
+
+    @abstractmethod
+    def requires_fitting(self) -> bool:
+        """Returns True if this TensorTransformer requires fitting.
+
+        Returns
+        -------
+        bool
+            True if this TensorTransformer requires fitting, else False.
+        """
+        pass
+
+
+class SklearnTensorTransformerWrapper(TensorTransformer):
+    """Wrapper class for sklearn feature transformers."""
+
+    def __init__(self,
+                 feature_transformer,
+                 requires_fitting=True):
+        """Creates a new SklearnTensorTransformerWrapper.
+
+        Parameters
+        ----------
+        feature_transformer
+            The sklearn feature transformer to be wrapped. An example of this would
+            be a sklearn pipeline containing a sequence of transformations.
+
+        requires_fitting : bool
+            Whether this tensor transformer should be fitted.
+        """
+        self.feature_transformer = feature_transformer
+        self.requires_fitting_flag = requires_fitting
+
+    def fit(self,
+            x: np.ndarray,
+            y: np.ndarray):
+        if self.requires_fitting():
+            self.feature_transformer.fit(x, y)
+
+    def transform(self,
+                  x: np.ndarray
+                  ) -> np.ndarray:
+        return self.feature_transformer.transform(x)
+
+    def requires_fitting(self) -> bool:
+        return self.requires_fitting_flag
+
+
+class FeatureTransformer:
+    """Class used to transform Dataset batches into features used in model prediction
+    and training."""
+
+    def __init__(self,
+                 feature_extraction_fn: Callable[[NamedTuple], np.ndarray],
+                 tensor_transformer: TensorTransformer = None):
+        """Creates a new FeatureTransformer.
+
+        Parameters
+        ----------
+        feature_extraction_fn : Callable[[NamedTuple], np.ndarray]
+            Callable that takes a podium feature batch as an argument and returns a
+            numpy array representing the data.
+
+        tensor_transformer : TensorTransformer
+            TensorTransformer used to transform the tensors provided by the
+            `feature_extraction_fn` callable.
+        """
+        self.feature_extraction_fn = feature_extraction_fn
+        self.tensor_transformer = tensor_transformer
+
+    def fit(self,
+            x: NamedTuple,
+            y: np.ndarray):
+        """Fits the contained TensorTransformer to the provided data.
+
+        Parameters
+        ----------
+        x : NamedTuple
+            Podium feature batch containing the features to be transformed.
+
+        y : np.ndarray
+            Labels corresponding to the features in `x`.
+        """
+        if not self.requires_fitting():
+            return
+
+        x_tensor = self.feature_extraction_fn(x)
+        self.tensor_transformer.fit(x_tensor, y)
+
+    def transform(self,
+                  x: NamedTuple) -> np.ndarray:
+        """Transforms the provided podium feature batch into a numpy array.
+
+        Parameters
+        ----------
+        x : NamedTuple
+            Feature batch to be transformed.
+
+        Returns
+        -------
+        np.ndarray
+            Transformed features.
+        """
+        x_tensor = self.feature_extraction_fn(x)
+        if self.tensor_transformer is None:
+            return x_tensor
+
+        else:
+            return self.tensor_transformer.transform(x_tensor)
+
+    def requires_fitting(self) -> bool:
+        """Returns True if the contained TensorTransformer exists and requires
+        fitting, else returns False.
+
+        Returns
+        -------
+        bool
+            True if the contained TensorTransformer exists and requires fitting,
+            else False.
+        """
+        return self.tensor_transformer is not None \
+            and self.tensor_transformer.requires_fitting()
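
For illustration, a minimal sketch of how the two new transformer classes compose, mirroring the wiring in experiment_example.py above; `batch_to_matrix` is a hypothetical extraction callable and scikit-learn is assumed to be installed:

    from sklearn.preprocessing import StandardScaler

    from takepod.models import FeatureTransformer, SklearnTensorTransformerWrapper

    def batch_to_matrix(x_batch):
        # Hypothetical extraction: pull a numeric feature matrix out of the batch.
        return x_batch.Text

    scaler = SklearnTensorTransformerWrapper(StandardScaler())
    transformer = FeatureTransformer(batch_to_matrix, scaler)

    # transformer.fit(x_batch, y) extracts the matrix and fits the scaler on it;
    # transformer.transform(x_batch) extracts the matrix and then scales it.
    # With tensor_transformer=None, transform() returns the extracted matrix as-is.
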
+        """
+        x_tensor = self.feature_extraction_fn(x)
diff --git a/takepod/pipeline/__init__.py b/takepod/pipeline/__init__.py
new file mode 100644
index 00000000..d2381c3a
--- /dev/null
+++ b/takepod/pipeline/__init__.py
@@ -0,0 +1,3 @@
+from .pipeline import Pipeline
+
+__all__ = ["Pipeline"]
diff --git a/takepod/pipeline/pipeline.py b/takepod/pipeline/pipeline.py
new file mode 100644
index 00000000..897eedfb
--- /dev/null
+++ b/takepod/pipeline/pipeline.py
@@ -0,0 +1,114 @@
+from typing import Union, Dict, List, Callable, NamedTuple, Any, Type
+import logging
+
+import numpy as np
+
+from takepod.storage import ExampleFactory, ExampleFormat
+from takepod.datasets import Dataset, Iterator
+from takepod.models import AbstractSupervisedModel, FeatureTransformer, Experiment, \
+    AbstractTrainer
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class Pipeline(Experiment):
+    """Class used to streamline the use of Podium. It contains all components needed
+    to train or fine-tune a pre-configured model and make predictions on new data."""
+
+    def __init__(self,
+                 fields: Union[Dict, List],
+                 example_format: ExampleFormat,
+                 feature_transformer: FeatureTransformer,
+                 model: Union[AbstractSupervisedModel, Type[AbstractSupervisedModel]],
+                 trainer: AbstractTrainer = None,
+                 trainer_iterator_callable: Callable[[Dataset], Iterator] = None,
+                 label_transform_fn: Callable[[NamedTuple], np.ndarray] = None
+                 ):
+        """Creates a new pipeline instance.
+
+        Parameters
+        ----------
+        fields : dict or list of fields
+            Fields used to process raw data. Can be either a dict mapping column
+            names to Fields (or tuples of Fields), or a list of Fields (or tuples of
+            Fields). A Field value of None means the corresponding column will be
+            ignored.
+
+        example_format : ExampleFormat
+            Format of expected raw examples.
+
+        feature_transformer : FeatureTransformer
+            FeatureTransformer used to transform data features from the podium
+            "batch" format into numpy arrays. Will be fitted along with the model to
+            the provided data.
+
+        model : class or model instance
+            Class of the Model to be fitted or a pre-trained model.
+            If a pre-trained model is passed and `fit` is called, a new model
+            instance will be created. For fine-tuning of the passed model instance
+            call `partial_fit`.
+            Must be a subclass of Podium's `AbstractSupervisedModel`.
+
+        trainer : AbstractTrainer, Optional
+            Trainer used to fit the model. If provided, this trainer instance will
+            be stored in the pipeline and used as the default trainer if no trainer
+            is provided in the `fit` and `partial_fit` methods.
+
+        trainer_iterator_callable : Callable[[Dataset], Iterator]
+            Callable used to instantiate new instances of the Iterator used in
+            fitting the model.
+
+        label_transform_fn : Callable[[NamedTuple], np.ndarray]
+            Callable that transforms the target part of the batch returned by the
+            iterator into the same format the model prediction is. For a
+            hypothetical perfect model, the prediction result of the model for some
+            examples must be identical to the result of this callable for those
+            same examples.
+        """
+        if example_format in (ExampleFormat.LIST, ExampleFormat.CSV, ExampleFormat.NLTK):
+            if not isinstance(fields, (list, tuple)):
+                error_msg = "If example format is LIST, CSV or NLTK, `fields` " \
+                            "must be either a list or tuple. " \
+                            "Type of `fields`: {}".format(type(fields))
+                _LOGGER.error(error_msg)
+                raise TypeError(error_msg)
+        elif not isinstance(fields, dict):
+            error_msg = "If example format is DICT, XML or JSON, `fields` " \
+                        "must be a dict. " \
+                        "Type of `fields`: {}".format(type(fields))
+            _LOGGER.error(error_msg)
+            raise TypeError(error_msg)
+
+        self.fields = fields
+        self.example_format = example_format
+        self.example_factory = ExampleFactory(fields)
+
+        super().__init__(model,
+                         feature_transformer=feature_transformer,
+                         trainer=trainer,
+                         training_iterator_callable=trainer_iterator_callable,
+                         label_transform_fun=label_transform_fn)
+
+    def predict_raw(self,
+                    raw_example: Any,
+                    **kwargs) -> np.ndarray:
+        """Computes the prediction of the model for a single example.
+        The example must be of the format provided in the constructor as the
+        `example_format` parameter.
+
+        Parameters
+        ----------
+        raw_example : Any
+            Example to compute the prediction for.
+
+        kwargs
+            Keyword arguments passed to the model's `predict` method.
+
+        Returns
+        -------
+        ndarray
+            Tensor containing the prediction for the example."""
+        processed_example = self.example_factory.from_format(raw_example,
+                                                             self.example_format)
+        ds = Dataset([processed_example], self.fields)
+
+        return self.predict(ds, **kwargs)
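
A minimal end-to-end sketch of the new Pipeline on raw data, modeled on the tests added below; `name_field` and `score_field` are assumed to be already-finalized Fields, and `my_transformer`/`pretrained_model` are hypothetical stand-ins:

    from takepod.pipeline import Pipeline
    from takepod.storage import ExampleFormat

    pipeline = Pipeline([name_field, score_field],
                        ExampleFormat.LIST,
                        feature_transformer=my_transformer,
                        model=pretrained_model)

    # The raw example is processed by the fields, wrapped in a single-example
    # Dataset and passed through the regular predict path.
    prediction = pipeline.predict_raw(["Marko", 30])
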
diff --git a/takepod/storage/__init__.py b/takepod/storage/__init__.py
index dd98daf4..1ce432f5 100644
--- a/takepod/storage/__init__.py
+++ b/takepod/storage/__init__.py
@@ -1,6 +1,6 @@
 """Package contains modules for storing and loading datasets and vectors."""
 
-from .example_factory import ExampleFactory
+from .example_factory import ExampleFactory, ExampleFormat
 from .field import Field, TokenizedField, MultilabelField, MultioutputField, unpack_fields
 from .resources.downloader import (BaseDownloader, SCPDownloader, HttpDownloader,
                                    SimpleHttpDownloader)
@@ -22,4 +22,4 @@
            "Field", "TokenizedField", "MultilabelField", "MultioutputField",
            "unpack_fields", "LargeResource", "SCPLargeResource", "VectorStorage",
            "BasicVectorStorage", "SpecialVocabSymbols", "Vocab",
-           "ExampleFactory", "TfIdfVectorizer"]
+           "ExampleFactory", "ExampleFormat", "TfIdfVectorizer"]
diff --git a/takepod/storage/example_factory.py b/takepod/storage/example_factory.py
index f5766712..768619c8 100644
--- a/takepod/storage/example_factory.py
+++ b/takepod/storage/example_factory.py
@@ -4,6 +4,7 @@
 import logging
 import json
 import csv
+from enum import Enum
 import xml.etree.ElementTree as ET
 
 from takepod.storage.field import unpack_fields
@@ -11,6 +12,15 @@
 _LOGGER = logging.getLogger(__name__)
 
 
+class ExampleFormat(Enum):
+    LIST = lambda data, factory: factory.from_list(data)  # noqa: E731
+    DICT = lambda data, factory: factory.from_dict(data)  # noqa: E731
+    CSV = lambda data, factory: factory.from_csv(data)  # noqa: E731
+    NLTK = lambda data, factory: factory.from_fields_tree(data)  # noqa: E731
+    XML = lambda data, factory: factory.from_xml_str(data)  # noqa: E731
+    JSON = lambda data, factory: factory.from_json(data)  # noqa: E731
+
+
 class Example:
     """Method models one example with fields that hold
     (raw, tokenized) values and special fields with "_"
@@ -152,7 +162,7 @@ def from_xml_str(self, data):
                 node = root
             else:
                 error_msg = "Specified name {} was not found in the " \
-                    "input data".format(name)
+                            "input data".format(name)
                 _LOGGER.error(error_msg)
                 raise ValueError(error_msg)
 
@@ -251,6 +261,11 @@ def from_fields_tree(self, data, subtrees=False):
         else:
             return self.from_list(tree_to_list(tree))
 
+    def from_format(self,
+                    data,
+                    format_tag: ExampleFormat):
+        return format_tag(data, self)
+
 
 def tree_to_list(tree):
     """Method joins tree leaves and label in one list.
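
Each ExampleFormat member above dispatches to the matching factory method, so `from_format` is a thin indirection. A short sketch, assuming an ExampleFactory built over a `fields` dict:

    factory = ExampleFactory(fields)

    # The two calls below are equivalent:
    example = factory.from_dict({"Name": "Mark Dark", "Score": 5})
    example = factory.from_format({"Name": "Mark Dark", "Score": 5},
                                  ExampleFormat.DICT)
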
diff --git a/takepod/storage/field.py b/takepod/storage/field.py
index 786f8c85..3032890b 100644
--- a/takepod/storage/field.py
+++ b/takepod/storage/field.py
@@ -518,7 +518,7 @@ def _process_tokens(self, data, tokens):
 
         data, tokens = self._run_posttokenization_hooks(data, tokens)
 
-        if self.eager and self.use_vocab:
+        if self.eager and self.use_vocab and not self.vocab.finalized:
             self.update_vocab(data, tokens)
 
         data = data if self.store_as_raw else None
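
The added `not self.vocab.finalized` guard matters for the Pipeline use case: an eager Field whose vocab was already finalized can now safely process new raw examples at prediction time. A sketch of the situation being guarded against (field construction details are illustrative):

    name_field = Field("Name", vocab=Vocab())  # eager by default
    # ... a dataset is built, then finalize_fields() finalizes the vocab ...

    # Previously, processing a new raw example through this field would call
    # update_vocab() on the finalized vocab; the guard now skips the update.
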
diff --git a/test/models/test_experiment.py b/test/models/test_experiment.py
index 40065431..d50ca46c 100644
--- a/test/models/test_experiment.py
+++ b/test/models/test_experiment.py
@@ -1,22 +1,78 @@
 from collections import namedtuple
 
+import pytest
 import numpy as np
 
 from takepod.models import AbstractSupervisedModel, Experiment
+from takepod.datasets import Dataset
+from takepod.storage import Field, ExampleFactory, Vocab
+
+
+def get_dataset():
+    data = [{"Name": "Mark Dark",
+             "Score": 5},
+            {"Name": "Stephen Smith",
+             "Score": 10},
+            {"Name": "Ann Mann",
+             "Score": 15}]
+
+    name_field = Field("Name",
+                       vocab=Vocab(),
+                       store_as_raw=True,
+                       tokenizer="split")
+
+    score_field = Field("Score",
+                        custom_numericalize=int,
+                        tokenize=False,
+                        is_target=True)
+
+    fields = {"Name": name_field,
+              "Score": score_field}
+
+    example_factory = ExampleFactory(fields)
+    examples = [example_factory.from_dict(data_) for data_ in data]
+
+    ds = Dataset(examples, fields)
+    ds.finalize_fields()
+    return ds
+
+
+class MockDataset:
+    pass
 
 
 def mock_feature_transform_fun(x_batch):
-    return x_batch.input
+    return x_batch.Score
 
 
 def mock_label_transform_fun(y_batch):
-    return y_batch.output
+    return y_batch.Score
 
 
-class MockDataset:
+class MockTransformer:
+
+    def __init__(self, to_fit):
+        self.to_fit = to_fit
+        self.fit_called = 0
+
+    def fit(self, x, y):
+        self.fit_called += 1
+
+    def transform(self, x_batch):
+        return mock_feature_transform_fun(x_batch)
+
+    def requires_fitting(self):
+        return self.to_fit
+
+
+class MockIterator:
     pass
 
 
-def test_experiment_train():
+@pytest.mark.parametrize("fit_transformer", (False, True))
+def test_experiment_train(fit_transformer):
+    dataset = get_dataset()
+
     default_model_args = {
         'm_arg1': 1,
         'm_arg2': 2
@@ -49,22 +105,9 @@ def test_experiment_train(fit_transformer):
         't_arg3': 4
     }
 
-    class MockIterator:
-        input_batch_class = namedtuple("input_batch_class", ["input"])
-        output_batch_class = namedtuple("output_batch_class", ["output"])
-
-        def __iter__(self):
-            x = np.array(
-                [
-                    [1, 2],
-                    [3, 4]
-                ])
-
-            y = np.array([5, 6])
+    mock_transformer = MockTransformer(fit_transformer)
 
-            input_batch = self.input_batch_class(input=x)
-            target_batch = self.output_batch_class(output=y)
-            yield input_batch, target_batch
+    my_iterator = MockIterator()
 
     class MockModel:
         def __init__(self, **kwargs):
@@ -78,12 +121,12 @@ def __init__(self):
         def train(self,
                   model,
                   iterator,
-                  feature_transform_fun,
+                  feature_transformer,
                   label_transform_fun,
                   **kwargs):
             assert isinstance(model, MockModel)
-            assert isinstance(iterator, MockIterator)
-            assert feature_transform_fun is mock_feature_transform_fun
+            assert iterator is my_iterator
+            assert feature_transformer is mock_transformer
             assert label_transform_fun is mock_label_transform_fun
             assert kwargs == expected_trainer_args
             self.train_called += 1
@@ -91,19 +134,23 @@ def train(self,
     trainer = MockTrainer()
 
     experiment = Experiment(MockModel,
-                            trainer,
-                            lambda _: MockIterator(),
-                            feature_transform_fun=mock_feature_transform_fun,
+                            trainer=trainer,
+                            training_iterator_callable=lambda _: my_iterator,
+                            feature_transformer=mock_transformer,
                             label_transform_fun=mock_label_transform_fun)
 
     experiment.set_default_model_args(**default_model_args)
     experiment.set_default_trainer_args(**default_trainer_args)
 
-    experiment.fit(MockDataset(),
+    experiment.fit(dataset,
                    model_args,
                    trainer_args)
 
     assert trainer.train_called == 1
+    if fit_transformer:
+        assert mock_transformer.fit_called == 1
+    else:
+        assert mock_transformer.fit_called == 0
 
 
 def test_experiment_predict():
@@ -170,7 +217,7 @@ def train(self, model, iterator, feature_transform_fun=None,
 
     experiment = Experiment(
         MockModel,
-        MockTrainer(),
+        trainer=MockTrainer(),
         training_iterator_callable=lambda _: MockIterator(),
         prediction_iterator_callable=lambda _: MockIterator()
diff --git a/test/models/test_simple_trainers.py b/test/models/test_simple_trainers.py
index 9145c10a..92d1fa7c 100644
--- a/test/models/test_simple_trainers.py
+++ b/test/models/test_simple_trainers.py
@@ -3,6 +3,7 @@
 
 from takepod.models.impl.simple_trainers import SimpleTrainer
 from takepod.models.model import AbstractSupervisedModel
+from takepod.models import FeatureTransformer
 from takepod.datasets.iterator import Iterator
 from test.storage.conftest import (tabular_dataset, json_file_path)  # noqa
 
@@ -20,7 +21,7 @@ def test_simple_trainer_no_num_epoch(tabular_dataset, model):
     trainer = SimpleTrainer()
     trainer.train(model,
                   iterator=iterator,
-                  feature_transform_fun=lambda x: x,
+                  feature_transformer=FeatureTransformer(lambda x: x),
                   label_transform_fun=lambda y: y)
 
 
@@ -29,9 +30,10 @@ def test_simple_trainer_num_epoch(tabular_dataset, model):
     tabular_dataset.finalize_fields()
     iterator = Iterator(tabular_dataset, batch_size=len(tabular_dataset))
     trainer = SimpleTrainer()
+    feature_transformer = FeatureTransformer(lambda x: x)
     trainer.train(model=model,
                   iterator=iterator,
-                  feature_transform_fun=lambda x: x,
+                  feature_transformer=feature_transformer,
                   label_transform_fun=lambda y: y,
                   **{trainer.MAX_EPOCH_KEY: 10})
     assert model.fit.call_count == 10
@@ -42,7 +44,7 @@ def mock_feature_transform_fun(x):
 
 
 def mock_label_transform_fun(y):
-    y
+    return y
 
 
 @pytest.mark.usefixtures("tabular_dataset", "mocker", "model")  # noqa
@@ -55,11 +57,12 @@ def test_simple_trainer_batch_transform_call(tabular_dataset, mocker, model):
     with mocker.patch(
             "test.models.test_simple_trainers.mock_label_transform_fun",
             return_value=next(iterator.__iter__())[1]):
+        feature_transformer = FeatureTransformer(mock_feature_transform_fun)
         trainer = SimpleTrainer()
         trainer.train(
             model=model,
             iterator=iterator,
-            feature_transform_fun=mock_feature_transform_fun,
+            feature_transformer=feature_transformer,
            label_transform_fun=mock_label_transform_fun,
            **{trainer.MAX_EPOCH_KEY: 10})
        assert mock_feature_transform_fun.call_count == 10  # pylint: disable=E1101
diff --git a/test/models/test_transformers.py b/test/models/test_transformers.py
new file mode 100644
index 00000000..e5d511be
--- /dev/null
+++ b/test/models/test_transformers.py
@@ -0,0 +1,106 @@
+from collections import namedtuple
+
+import numpy as np
+
+from takepod.models import FeatureTransformer, TensorTransformer, \
+    SklearnTensorTransformerWrapper
+
+
+class MockTensorTransformer(TensorTransformer):
+
+    def __init__(self, requires_fitting):
+        self.requires_fitting_flag = requires_fitting
+
+    def fit(self,
+            x: np.ndarray,
+            y: np.ndarray):
+        pass
+
+    def transform(self,
+                  x: np.ndarray
+                  ) -> np.ndarray:
+        return [3, 4]
+
+    def requires_fitting(self) -> bool:
+        return self.requires_fitting_flag
+
+
+def test_feature_transformer(mocker):
+    mock_batch_class = namedtuple("Mock_feature_batch", ("mock_feature",))
+
+    def mock_feature_extraction_fn(x):
+        return x.mock_feature
+
+    mock_tensor_transformer = MockTensorTransformer(requires_fitting=True)
+
+    mocker.spy(mock_tensor_transformer, 'fit')
+    mocker.spy(mock_tensor_transformer, 'transform')
+
+    feature_transformer = FeatureTransformer(mock_feature_extraction_fn,
+                                             mock_tensor_transformer)
+
+    mock_feature_batch = mock_batch_class(mock_feature=[1, 2])
+    y = [3, 4]
+
+    feature_transformer.fit(mock_feature_batch, y)
+    mock_tensor_transformer.fit.assert_called_once_with([1, 2], [3, 4])
+
+    mock_feature_batch_2 = mock_batch_class(mock_feature=[4, 5])
+    assert np.all(feature_transformer.transform(mock_feature_batch_2) == [3, 4])
+    mock_tensor_transformer.transform.assert_called_once_with([4, 5])
+
+    mock_tensor_transformer_no_fit = MockTensorTransformer(requires_fitting=False)
+    mocker.spy(mock_tensor_transformer_no_fit, 'fit')
+    mocker.spy(mock_tensor_transformer_no_fit, 'transform')
+    feature_transformer = FeatureTransformer(mock_feature_extraction_fn,
+                                             mock_tensor_transformer_no_fit)
+
+    feature_transformer.fit(mock_feature_batch, y)
+    mock_tensor_transformer_no_fit.fit.assert_not_called()
+    assert np.all(feature_transformer.transform(mock_feature_batch_2) == [3, 4])
+    mock_tensor_transformer_no_fit.transform.assert_called_once_with([4, 5])
+
+
+def test_sklearn_feature_transformer_wrapper(mocker):
+    class MockSklearnTransformer:
+
+        def fit(self, x, y):
+            pass
+
+        def transform(self, x):
+            return x + 1
+
+    # test with fitting
+    tensor_transformer = MockSklearnTransformer()
+    mocker.spy(tensor_transformer, 'fit')
+    mocker.spy(tensor_transformer, 'transform')
+
+    mock_feature_batch = np.array([[1, 2, 3]])
+    mock_label_batch = np.array([[2, 3, 4]])
+
+    wrapper = SklearnTensorTransformerWrapper(tensor_transformer, requires_fitting=True)
+
+    assert wrapper.requires_fitting()
+
+    wrapper.fit(mock_feature_batch, mock_label_batch)
+    tensor_transformer.fit.assert_called_once_with(mock_feature_batch, mock_label_batch)
+
+    result = wrapper.transform(mock_feature_batch)
+    tensor_transformer.transform.assert_called_once_with(mock_feature_batch)
+    assert np.all(result == mock_feature_batch + 1)
+
+    # test without fitting
+    tensor_transformer = MockSklearnTransformer()
+    mocker.spy(tensor_transformer, 'fit')
+    mocker.spy(tensor_transformer, 'transform')
+
+    wrapper = SklearnTensorTransformerWrapper(tensor_transformer, requires_fitting=False)
+
+    assert not wrapper.requires_fitting()
+
+    wrapper.fit(mock_feature_batch, mock_label_batch)
+    tensor_transformer.fit.assert_not_called()
+
+    result = wrapper.transform(mock_feature_batch)
+    tensor_transformer.transform.assert_called_once_with(mock_feature_batch)
+    assert np.all(result == mock_feature_batch + 1)
diff --git a/test/pipeline/__init__.py b/test/pipeline/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py
new file mode 100644
index 00000000..0bdcf57a
--- /dev/null
+++ b/test/pipeline/test_pipeline.py
@@ -0,0 +1,89 @@
+import numpy as np
+
+from takepod.pipeline import Pipeline
+from takepod.storage import Field, ExampleFormat
+from takepod.models import AbstractSupervisedModel
+
+
+def get_fields():
+    name_dict = {
+        "Marko": 1,
+        "Darko": 2,
+        "Ivana": 3
+    }
+
+    name_field = Field("Name", custom_numericalize=name_dict.get)
+    score_field = Field("Score", tokenize=False, custom_numericalize=int)
+
+    name_field.finalize()
+    score_field.finalize()
+
+    return {"Name": name_field,
+            "Score": score_field}
+
+
+mock_data = [
+    ["Marko", 50],
+    ["Darko", 60],
+    ["Ivana", 45]
+]
+
+
+class MockModel:
+
+    def fit(self, *args, **kwargs):
+        pass
+
+    def predict(self, x, **kwargs):
+        return {AbstractSupervisedModel.PREDICTION_KEY: x}
+
+
+class MockTrainer:
+    def train(self, *args, **kwargs):
+        pass
+
+
+class MockFeatureTransformer:
+
+    def transform(self, x_batch):
+        return np.hstack((x_batch.Name, x_batch.Score))
+
+
+def test_pipeline_from_raw():
+    fields = get_fields()
+
+    # Test for LIST format
+    fields_list = [fields['Name'], fields['Score']]
+    list_pipeline = Pipeline(fields_list,
+                             ExampleFormat.LIST,
+                             feature_transformer=MockFeatureTransformer(),
+                             model=MockModel())
+
+    raw_list = ["Marko", 30]
+    expected_prediction = np.array([[1, 30]])
+    prediction = list_pipeline.predict_raw(raw_list)
+
+    assert np.all(expected_prediction == prediction)
+
+    # Test for DICT format
+    fields_dict = {field.name: field for field in fields_list}
+    dict_pipeline = Pipeline(fields_dict,
+                             ExampleFormat.DICT,
+                             feature_transformer=MockFeatureTransformer(),
+                             model=MockModel())
+
+    raw_dict = {'Name': "Marko", 'Score': 30}
+    expected_prediction = np.array([[1, 30]])
+    prediction = dict_pipeline.predict_raw(raw_dict)
+
+    assert np.all(expected_prediction == prediction)
+
+    # Test for CSV format
+    raw_csv = "Marko,30"
+    csv_pipeline = Pipeline(fields_list,
+                            ExampleFormat.CSV,
+                            feature_transformer=MockFeatureTransformer(),
+                            model=MockModel())
+    expected_prediction = np.array([[1, 30]])
+    prediction = csv_pipeline.predict_raw(raw_csv)
+    assert np.all(expected_prediction == prediction)
diff --git a/test/storage/test_example_factory.py b/test/storage/test_example_factory.py
index 367a1401..b23fb3ac 100644
--- a/test/storage/test_example_factory.py
+++ b/test/storage/test_example_factory.py
@@ -1,6 +1,6 @@
 import pytest
 
-from takepod.storage import ExampleFactory, Field
+from takepod.storage import ExampleFactory, Field, ExampleFormat
 
 name_field = Field("Name",
                    store_as_raw=True,
@@ -397,3 +397,25 @@ def test_cache_data_field_from_dict(expected_values):
 
         assert hasattr(example, field_name)
         assert hasattr(example, "{}_".format(field_name))
+
+
+def test_from_format():
+    list_example_factory = ExampleFactory(field_list)
+
+    list_data = ["Mark Dark", 5, "Hawaiian pizza"]
+    example = list_example_factory.from_format(list_data, ExampleFormat.LIST)
+
+    assert example.Name[0] == list_data[0]
+    assert example.Score[0] == list_data[1]
+    assert example.Favorite_food[0] == list_data[2]
+
+    dict_example_factory = ExampleFactory(field_dict)
+    dict_data = {"Name": "Mark Dark",
+                 "Score": 5,
+                 "Favorite_food": "Hawaiian pizza"}
+
+    example = dict_example_factory.from_format(dict_data, ExampleFormat.DICT)
+    assert example.Name[0] == dict_data["Name"]
+    assert example.Score[0] == dict_data["Score"]
+    assert example.Favorite_food[0] == dict_data["Favorite_food"]
+    # TODO extend testing to other formats?