diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 2db74080..4fce53bf 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -172,24 +172,26 @@ The process of releasing a new version involves several steps combining both ``g 1. Merge what is in ``master`` branch into ``stable`` branch. 2. Update the version in ``setup.cfg``, ``mlblocks/__init__.py`` and ``HISTORY.md`` files. -3. Create a new TAG pointing at the correspoding commit in ``stable`` branch. +3. Create a new git tag pointing at the corresponding commit in ``stable`` branch. 4. Merge the new commit from ``stable`` into ``master``. -5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py`` to open the next - development interation. +5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py`` + to open the next development iteration. -**Note:** Before starting the process, make sure that ``HISTORY.md`` has a section titled -**Unreleased** with the list of changes that will be included in the new version, and that -these changes are committed and available in ``master`` branch. -Normally this is just a list of the Pull Requests that have been merged since the latest version. +.. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new + entry that explains the changes that will be included in the new version. + Normally this is just a list of the Pull Requests that have been merged to master + since the last release. -Once this is done, just run the following commands:: +Once this is done, run one of the following commands: + +1. If you are releasing a patch version:: - git checkout stable - git merge --no-ff master # This creates a merge commit - bumpversion release # This creates a new commit and a TAG - git push --tags origin stable + make release - git checkout master - git merge stable - bumpversion --no-tag patch - git push + +2. If you are releasing a minor version:: + + make release-minor + +3. If you are releasing a major version:: + + make release-major diff --git a/Makefile b/Makefile index dc62e90d..6266033f 100644 --- a/Makefile +++ b/Makefile @@ -98,6 +98,11 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort autopep8 --in-place --recursive --aggressive tests isort --apply --atomic --recursive tests +.PHONY: lint-docs +lint-docs: ## check docs formatting with doc8 and pydocstyle + doc8 mlblocks/ + pydocstyle mlblocks/ + # TEST TARGETS @@ -122,7 +127,6 @@ coverage: ## check code coverage quickly with the default Python .PHONY: docs docs: clean-docs ## generate Sphinx HTML documentation, including API docs $(MAKE) -C docs html - touch docs/_build/html/.nojekyll .PHONY: view-docs view-docs: docs ## view docs in browser diff --git a/docs/changelog.rst b/docs/changelog.rst index fcd2eb2d..d26e5be8 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1 +1 @@ -.. include:: ../HISTORY.md +.. mdinclude:: ../HISTORY.md diff --git a/docs/conf.py b/docs/conf.py index 8659996f..95653914 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,18 +18,9 @@ # relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. 
-import os -import sys - import sphinx_rtd_theme # For read the docs theme -from recommonmark.parser import CommonMarkParser -# from recommonmark.transform import AutoStructify - -# sys.path.insert(0, os.path.abspath('..')) import mlblocks -# -# mlblocks.add_primitives_path('../mlblocks_primitives') # -- General configuration --------------------------------------------- @@ -40,13 +31,21 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'sphinx.ext.napoleon', + 'm2r', + 'sphinx.ext.autodoc', 'sphinx.ext.githubpages', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', 'sphinx.ext.graphviz', 'IPython.sphinxext.ipython_console_highlighting', 'IPython.sphinxext.ipython_directive', + 'autodocsumm', ] +autodoc_default_options = { + 'autosummary': True, +} + ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] # Add any paths that contain templates here, relative to this directory. @@ -56,10 +55,6 @@ # You can specify multiple suffix as a list of string: source_suffix = ['.rst', '.md', '.ipynb'] -source_parsers = { - '.md': CommonMarkParser, -} - # The master toctree document. master_doc = 'index' diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 2e00ece6..2115fcef 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -24,7 +24,7 @@ them to the `MLPipeline class`_: from mlblocks import MLPipeline primitives = [ - 'mlprimitives.feature_extraction.StringVectorizer', + 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] pipeline = MLPipeline(primitives) diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst index 5503e739..54ef85a1 100644 --- a/docs/pipeline_examples/graph.rst +++ b/docs/pipeline_examples/graph.rst @@ -39,7 +39,7 @@ additional information not found inside `X`. primitives = [ 'networkx.link_prediction_feature_extraction', - 'mlprimitives.feature_extraction.CategoricalEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 'sklearn.preprocessing.StandardScaler', 'xgboost.XGBClassifier' ] @@ -69,6 +69,6 @@ additional information not found inside `X`. .. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html -.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.feature_extraction.CategoricalEncoder.json +.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst index df8a9d5a..03472ea3 100644 --- a/docs/pipeline_examples/text.rst +++ b/docs/pipeline_examples/text.rst @@ -40,31 +40,31 @@ for later ones. 
# set up the pipeline primitives = [ - "mlprimitives.counters.UniqueCounter", - "mlprimitives.text.TextCleaner", - "mlprimitives.counters.VocabularyCounter", + "mlprimitives.custom.counters.UniqueCounter", + "mlprimitives.custom.text.TextCleaner", + "mlprimitives.custom.counters.VocabularyCounter", "keras.preprocessing.text.Tokenizer", "keras.preprocessing.sequence.pad_sequences", "keras.Sequential.LSTMTextClassifier" ] input_names = { - "mlprimitives.counters.UniqueCounter#1": { + "mlprimitives.custom.counters.UniqueCounter#1": { "X": "y" } } output_names = { - "mlprimitives.counters.UniqueCounter#1": { + "mlprimitives.custom.counters.UniqueCounter#1": { "counts": "classes" }, - "mlprimitives.counters.VocabularyCounter#1": { + "mlprimitives.custom.counters.VocabularyCounter#1": { "counts": "vocabulary_size" } } init_params = { - "mlprimitives.counters.VocabularyCounter#1": { + "mlprimitives.custom.counters.VocabularyCounter#1": { "add": 1 }, - "mlprimitives.text.TextCleaner#1": { + "mlprimitives.custom.text.TextCleaner#1": { "language": "en" }, "keras.preprocessing.sequence.pad_sequences#1": { @@ -116,12 +116,12 @@ to encode all the string features, and go directly into the nltk.download('stopwords') primitives = [ - 'mlprimitives.text.TextCleaner', - 'mlprimitives.feature_extraction.StringVectorizer', + 'mlprimitives.custom.text.TextCleaner', + 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] init_params = { - 'mlprimitives.text.TextCleaner': { + 'mlprimitives.custom.text.TextCleaner': { 'column': 'text', 'language': 'nl' }, diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py index b5ed6b46..0c69afda 100644 --- a/mlblocks/datasets.py +++ b/mlblocks/datasets.py @@ -100,6 +100,7 @@ class Dataset(): **kwargs: Any additional keyword argument passed on initialization will be made available as instance attributes. """ + def __init__(self, description, data, target, score, shuffle=True, stratify=False, **kwargs): self.name = description.splitlines()[0] @@ -115,10 +116,10 @@ def __init__(self, description, data, target, score, shuffle=True, stratify=Fals self.__dict__.update(kwargs) def score(self, *args, **kwargs): - """Scoring function for this dataset. + r"""Scoring function for this dataset. Args: - \\*args, \\*\\*kwargs: Any given arguments and keyword arguments will be + \*args, \*\*kwargs: Any given arguments and keyword arguments will be directly passed to the given scoring function. Returns: @@ -141,7 +142,7 @@ def _get_split(data, index): else: return data[index] - def get_splits(self, n_splits=1): + def get_splits(self, n_splits=1, random_state=0): """Return splits of this dataset ready for Cross Validation. If n_splits is 1, a tuple containing the X for train and test @@ -166,12 +167,13 @@ def get_splits(self, n_splits=1): self.data, self.target, shuffle=self._shuffle, - stratify=stratify + stratify=stratify, + random_state=random_state ) else: cv_class = StratifiedKFold if self._stratify else KFold - cv = cv_class(n_splits=n_splits, shuffle=self._shuffle) + cv = cv_class(n_splits=n_splits, shuffle=self._shuffle, random_state=random_state) splits = list() for train, test in cv.split(self.data, self.target): @@ -314,7 +316,6 @@ def load_dic28(): There exist 52,652 words (vertices in a network) having 2 up to 8 characters in the dictionary. The obtained network has 89038 edges. 
""" - dataset_path = _load('dic28') X = _load_csv(dataset_path, 'data') @@ -343,7 +344,6 @@ def load_nomination(): Data consists of one graph whose nodes contain two attributes, attr1 and attr2. Associated with each node is a label that has to be learned and predicted. """ - dataset_path = _load('nomination') X = _load_csv(dataset_path, 'data') @@ -362,7 +362,6 @@ def load_amazon(): co-purchased with product j, the graph contains an undirected edge from i to j. Each product category provided by Amazon defines each ground-truth community. """ - dataset_path = _load('amazon') X = _load_csv(dataset_path, 'data') @@ -382,7 +381,6 @@ def load_jester(): source: "University of California Berkeley, CA" sourceURI: "http://eigentaste.berkeley.edu/dataset/" """ - dataset_path = _load('jester') X = _load_csv(dataset_path, 'data') @@ -392,7 +390,7 @@ def load_jester(): def load_wikiqa(): - """A Challenge Dataset for Open-Domain Question Answering. + """Challenge Dataset for Open-Domain Question Answering. WikiQA dataset is a publicly available set of question and sentence (QS) pairs, collected and annotated for research on open-domain question answering. @@ -400,7 +398,6 @@ def load_wikiqa(): source: "Microsoft" sourceURI: "https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/#" """ # noqa - dataset_path = _load('wikiqa') data = _load_csv(dataset_path, 'data', set_index=True) diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py index a5cdb6a4..c3878e68 100644 --- a/mlblocks/mlblock.py +++ b/mlblocks/mlblock.py @@ -25,32 +25,34 @@ class MLBlock(): as wrapping them and providing a common interface to run them. Attributes: - name (str): Name given to this MLBlock. - primitive (object): the actual function or instance which this MLBlock - wraps. - fit_args (dict): specification of the arguments expected by the `fit` - method. - fit_method (str): name of the primitive method to call on `fit`. - `None` if the primitive is a function. - produce_args (dict): specification of the arguments expected by the - `predict` method. - produce_output (dict): specification of the outputs of the `produce` - method. - produce_method (str): name of the primitive method to call on - `produce`. `None` if the primitive is a function. + name (str): + Name given to this MLBlock. + primitive (object): + the actual function or instance which this MLBlock wraps. + fit_args (dict): + specification of the arguments expected by the `fit` method. + fit_method (str): + name of the primitive method to call on `fit`. `None` if the primitive is a function. + produce_args (dict): + specification of the arguments expected by the `predict` method. + produce_output (dict): + specification of the outputs of the `produce` method. + produce_method (str): + name of the primitive method to call on `produce`. `None` if the primitive is a + function. Args: - name (str): Name given to this MLBlock. - **kwargs: Any additional arguments that will be used as - hyperparameters or passed to the `fit` or `produce` - methods. + name (str): + Name given to this MLBlock. + **kwargs: + Any additional arguments that will be used as hyperparameters or passed to the + `fit` or `produce` methods. Raises: - TypeError: A `TypeError` is raised if a required argument is not - found within the `kwargs` or if an unexpected - argument has been given. 
- """ - # pylint: disable=too-many-instance-attributes + TypeError: + A `TypeError` is raised if a required argument is not found within the `kwargs` + or if an unexpected argument has been given. + """ # pylint: disable=too-many-instance-attributes def _extract_params(self, kwargs, hyperparameters): """Extract init, fit and produce params from kwargs. @@ -63,16 +65,16 @@ def _extract_params(self, kwargs, hyperparameters): have been given and that nothing unexpected exists in the input. Args: - kwargs (dict): dict containing the Keyword arguments that have - been passed to the `__init__` method upon - initialization. - hyperparameters (dict): hyperparameters dictionary, as found in - the JSON annotation. + kwargs (dict): + dict containing the Keyword arguments that have been passed to the `__init__` + method upon initialization. + hyperparameters (dict): + hyperparameters dictionary, as found in the JSON annotation. Raises: - TypeError: A `TypeError` is raised if a required argument is not - found in the `kwargs` dict, or if an unexpected - argument has been given. + TypeError: + A `TypeError` is raised if a required argument is not found in the `kwargs` dict, + or if an unexpected argument has been given. """ init_params = dict() fit_params = dict() @@ -138,7 +140,6 @@ def _get_tunable(cls, hyperparameters, init_params): return tunable def __init__(self, name, **kwargs): - self.name = name metadata = load_primitive(name) @@ -174,6 +175,7 @@ def __init__(self, name, **kwargs): self.set_hyperparameters(default) def __str__(self): + """Return a string that represents this block.""" return 'MLBlock - {}'.format(self.name) def get_tunable_hyperparameters(self): @@ -210,9 +212,9 @@ def set_hyperparameters(self, hyperparameters): If necessary, a new instance of the primitive is created. Args: - hyperparameters (dict): Dictionary containing as keys the name - of the hyperparameters and as values - the values to be used. + hyperparameters (dict): + Dictionary containing as keys the name of the hyperparameters and as + values the values to be used. """ self._hyperparameters.update(hyperparameters) @@ -233,12 +235,13 @@ def fit(self, **kwargs): the primitive is a simple function, this will be a noop. Args: - **kwargs: Any given keyword argument will be directly passed - to the primitive fit method. + **kwargs: + Any given keyword argument will be directly passed to the primitive fit method. Raises: - TypeError: A `TypeError` might be raised if any argument not - expected by the primitive fit method is given. + TypeError: + A `TypeError` might be raised if any argument not expected by the primitive fit + method is given. """ if self.fit_method is not None: fit_args = self._fit_params.copy() diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 058737ee..eddb442e 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -34,41 +34,39 @@ class MLPipeline(): results, which will be returned as the prediction of the pipeline. Attributes: - primitives (list): List of the names of the primitives that compose - this pipeline. - blocks (list): OrderedDict of the block names and the corresponding - MLBlock instances. - init_params (dict): init_params dictionary, as given when the instance - was created. - input_names (dict): input_names dictionary, as given when the instance - was created. - output_names (dict): output_names dictionary, as given when the instance - was created. + primitives (list): + List of the names of the primitives that compose this pipeline. 
+ blocks (OrderedDict): + OrderedDict of the block names and the corresponding MLBlock instances. + init_params (dict): + init_params dictionary, as given when the instance was created. + input_names (dict): + input_names dictionary, as given when the instance was created. + output_names (dict): + output_names dictionary, as given when the instance was created. Args: - primitives (list): List with the names of the primitives that will - compose this pipeline. - init_params (dict): dictionary containing initialization arguments to - be passed when creating the MLBlocks instances. - The dictionary keys must be the corresponding - primitive names and the values must be another - dictionary that will be passed as `**kargs` to the - MLBlock instance. - input_names (dict): dictionary that maps input variable names with the - actual names expected by each primitive. This - allows reusing the same input argument for multiple - primitives that name it differently, as well as - passing different values to primitives that expect - arguments named similary. - output_names (dict): dictionary that maps output variable names with - the name these variables will be given when stored - in the context dictionary. This allows storing - the output of different primitives in different - variables, even if the primitive output name is - the same one. + primitives (list): + List with the names of the primitives that will compose this pipeline. + init_params (dict): + dictionary containing initialization arguments to be passed when creating the + MLBlocks instances. The dictionary keys must be the corresponding primitive names + and the values must be another dictionary that will be passed as `**kwargs` to the + MLBlock instance. + input_names (dict): + dictionary that maps input variable names with the actual names expected by each + primitive. This allows reusing the same input argument for multiple primitives that + name it differently, as well as passing different values to primitives that expect + arguments named similarly. + output_names (dict): + dictionary that maps output variable names with the name these variables will be + given when stored in the context dictionary. This allows storing the output of + different primitives in different variables, even if the primitive output name is + the same one. """ def _get_tunable_hyperparameters(self): + """Get the tunable hyperparameters from all the blocks in this pipeline.""" tunable = {} for block_name, block in self.blocks.items(): tunable[block_name] = block.get_tunable_hyperparameters() @@ -132,14 +130,33 @@ def set_hyperparameters(self, hyperparameters): """Set new hyperparameter values for some blocks. Args: - hyperparameters (dict): A dictionary containing the block names as - keys and the new hyperparameters dictionary - as values. + hyperparameters (dict): + A dictionary containing the block names as keys and the new hyperparameters + dictionary as values. """ for block_name, block_hyperparams in hyperparameters.items(): self.blocks[block_name].set_hyperparameters(block_hyperparams) def _get_block_args(self, block_name, block_args, context): + """Get the arguments expected by the block method from the context. + + The arguments will be taken from the context using both the method + arguments specification and the `input_names` given when the pipeline + was created. + + Args: + block_name (str): + Name of this block. Used to find the corresponding input_names. + block_args (list): + list of method argument specifications from the primitive. 
+ context (dict): + current context dictionary. + + Returns: + dict: + A dictionary containing the argument names and values to pass + to the method. + """ # TODO: type validation and/or transformation should be done here input_names = self.input_names.get(block_name, dict()) @@ -166,7 +183,8 @@ return kwargs - def _get_outputs(self, block_name, outputs, block_outputs): + def _extract_outputs(self, block_name, outputs, block_outputs): + """Extract the outputs of the method as a dict to be set into the context.""" # TODO: type validation and/or transformation should be done here if not isinstance(outputs, tuple): @@ -188,7 +206,98 @@ return output_dict - def fit(self, X=None, y=None, **kwargs): + def _get_block_name(self, index): + """Get the name of the block in the `index` position.""" + return list(self.blocks.keys())[index] + + def _get_output_spec(self, output): + """Parse the output specification and get a block name and a variable name. + + The output specification can be of two types: int and str. + + If it is an integer, it is interpreted as a block index, and the variable name + is considered to be ``None``, which means that the whole context will be returned. + + If it is a string, it can be interpreted in three ways: + + * **block name**: If the string matches a block name exactly, including + its hash and counter number ``#n`` at the end, the whole context will be + returned after that block is produced. + * **variable_name**: If the string does not match any block name and does + not contain any dot character, ``'.'``, it will be considered a variable + name. In this case, the indicated variable will be extracted from the + context and returned after the last block has been produced. + * **block_name + variable_name**: If the complete string does not match a + block name but it contains at least one dot, ``'.'``, it will be split + in two parts on the last dot. If the first part of the string matches a + block name exactly, the second part of the string will be considered a + variable name, assuming the format ``{block_name}.{variable_name}``, and + the indicated variable will be extracted from the context and returned + after the block has been produced. Otherwise, if the extracted + ``block_name`` does not match a block name exactly, a ``ValueError`` + will be raised. + + Args: + output (str or int): + Output specification as either a string or an integer. + + Raises: + ValueError: + If the output string contains dots but it does not match a block + name exactly. + + Returns: + tuple: + The output is a tuple containing: + * block_name (str): name of the block from which the output will be + returned, including its counter number. + * variable_name (str): Name of the variable to extract from the context. + It can be ``None``, which means that the whole context is to be + returned. + """ + # If None is given, both block and variable are None + if output is None: + return None, None + + # If an int is given, it is a block index and there is no variable + if isinstance(output, int): + output = self._get_block_name(output) + return output, None + + # If the string matches a block name, there is no variable + if output in self.blocks: + return output, None + + # If there is at least one dot in the output, but it did not match + # a block name, it is considered to be {block_name}.{variable_name} + if '.' 
in output: + output_block, output_variable = output.rsplit('.', 1) + if output_block not in self.blocks: + raise ValueError('Unknown block name: {}'.format(output_block)) + + return output_block, output_variable + + # If the given string is not a block name and it has no dots, + # it is considered to be a variable name to be extracted + # from the context after the last block has been produced + last_block_name = self._get_block_name(-1) + return last_block_name, output + + def _get_output(self, output_variable, context): + """Get the specified output variable from the context. + + If the variable name is ``None``, return the entire context. + """ + if output_variable: + if output_variable not in context: + raise ValueError('Output variable {} not found in context' + .format(output_variable)) + + return context[output_variable] + else: + return context + + def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): """Fit the blocks of this pipeline. Sequentially call the `fit` and the `produce` methods of each block, @@ -201,11 +310,58 @@ `produce` calls will be taken. Args: - X: Fit Data, which the pipeline will learn from. - y: Fit Data labels, which the pipeline will use to learn how to - behave. - **kwargs: Any additional keyword arguments will be directly added - to the context dictionary and available for the blocks. + X: + Fit data, which the pipeline will learn from. + + y: + Fit data labels, which the pipeline will use to learn how to + behave. + + output_ (str or int or None): + Output specification, which can be a string or an integer or None. + + * If it is None (default), nothing will be returned. + * If an integer is given, it is interpreted as the block number, and the whole + context after running the specified block will be returned. + * If it is a string, it can be interpreted in three ways: + + * **block name**: If the string matches a block name exactly, including + its hash and counter number ``#n`` at the end, the whole context will be + returned after that block is produced. + * **variable_name**: If the string does not match any block name and does + not contain any dot character, ``'.'``, it will be considered a variable + name. In this case, the indicated variable will be extracted from the + context and returned after the last block has been produced. + * **block_name + variable_name**: If the complete string does not match a + block name but it contains at least one dot, ``'.'``, it will be split + in two parts on the last dot. If the first part of the string matches a + block name exactly, the second part of the string will be considered a + variable name, assuming the format ``{block_name}.{variable_name}``, and + the indicated variable will be extracted from the context and returned + after the block has been produced. Otherwise, if the extracted + ``block_name`` does not match a block name exactly, a ``ValueError`` + will be raised. + + start_ (str or int or None): + Block index or block name to start processing from. The + value can either be an integer, which will be interpreted as a block index, + or the name of a block, including the counter number at the end. + If given, the execution of the pipeline will start on the specified block, + and all the blocks before that one will be skipped. + + **kwargs: + Any additional keyword arguments will be directly added + to the context dictionary and available for the blocks. 
+ + Returns: + None or dict or object: + * If no output is specified, nothing will be returned. + * If an output block has been specified without an output variable, the + context dictionary will be returned after the produce method of that block + has been called. + * If both an output block and an output variable have been specified, + the value of that variable from the context will be extracted and returned + after the produce method of that block has been called. """ context = { 'X': X, @@ -213,8 +369,20 @@ } context.update(kwargs) - last_block_name = list(self.blocks.keys())[-1] + output_block, output_variable = self._get_output_spec(output_) + last_block_name = self._get_block_name(-1) + + if isinstance(start_, int): + start_ = self._get_block_name(start_) + for block_name, block in self.blocks.items(): + if start_: + if block_name == start_: + start_ = False + else: + LOGGER.debug("Skipping block %s fit", block_name) + continue + LOGGER.debug("Fitting block %s", block_name) try: fit_args = self._get_block_args(block_name, block.fit_args, context) @@ -223,19 +391,26 @@ LOGGER.exception("Exception caught fitting MLBlock %s", block_name) raise - if block_name != last_block_name: + if (block_name != last_block_name) or (block_name == output_block): LOGGER.debug("Producing block %s", block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) outputs = block.produce(**produce_args) - output_dict = self._get_outputs(block_name, outputs, block.produce_output) + output_dict = self._extract_outputs(block_name, outputs, block.produce_output) context.update(output_dict) except Exception: LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - def predict(self, X=None, **kwargs): + if block_name == output_block: + return self._get_output(output_variable, context) + + if start_: + # We skipped all the blocks up to the end + raise ValueError('Unknown block name: {}'.format(start_)) + + def predict(self, X=None, output_=None, start_=None, **kwargs): """Produce predictions using the blocks of this pipeline. Sequentially call the `produce` method of each block, capturing the @@ -247,31 +422,92 @@ will be taken. Args: - X: Data which the pipeline will use to make predictions. - **kwargs: Any additional keyword arguments will be directly added - to the context dictionary and available for the blocks. + X: + Data which the pipeline will use to make predictions. + + output_ (str or int or None): + Output specification, which can be a string or an integer or None. + * If it is None (default), the output of the last block will be returned. + * If an integer is given, it is interpreted as the block number, and the whole + context after running the specified block will be returned. + * If it is a string, it can be interpreted in three ways: + + * **block name**: If the string matches a block name exactly, including + its hash and counter number ``#n`` at the end, the whole context will be + returned after that block is produced. + * **variable_name**: If the string does not match any block name and does + not contain any dot character, ``'.'``, it will be considered a variable + name. In this case, the indicated variable will be extracted from the + context and returned after the last block has been produced. 
+ * **block_name + variable_name**: If the complete string does not match a + block name but it contains at least one dot, ``'.'``, it will be split + in two parts on the last dot. If the first part of the string matches a + block name exactly, the second part of the string will be considered a + variable name, assuming the format ``{block_name}.{variable_name}``, and + the indicated variable will be extracted from the context and returned + after the block has been produced. Otherwise, if the extracted + ``block_name`` does not match a block name exactly, a ``ValueError`` + will be raised. + + start_ (str or int or None): + Block index or block name to start processing from. The + value can either be an integer, which will be interpreted as a block index, + or the name of a block, including the counter number at the end. + If given, the execution of the pipeline will start on the specified block, + and all the blocks before that one will be skipped. + + **kwargs: + Any additional keyword arguments will be directly added + to the context dictionary and available for the blocks. + + Returns: + None or dict or object: + * If no output is specified, the output of the last block will be returned. + * If an output block has been specified without an output variable, the + context dictionary will be returned after the produce method of that block + has been called. + * If both an output block and an output variable have been specified, + the value of that variable from the context will be extracted and returned + after the produce method of that block has been called. """ context = { 'X': X } context.update(kwargs) - last_block_name = list(self.blocks.keys())[-1] + output_block, output_variable = self._get_output_spec(output_) + + if isinstance(start_, int): + start_ = self._get_block_name(start_) + for block_name, block in self.blocks.items(): + if start_: + if block_name == start_: + start_ = False + else: + LOGGER.debug("Skipping block %s produce", block_name) + continue + LOGGER.debug("Producing block %s", block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) outputs = block.produce(**produce_args) + output_dict = self._extract_outputs(block_name, outputs, block.produce_output) + context.update(output_dict) - if block_name != last_block_name: - output_dict = self._get_outputs(block_name, outputs, block.produce_output) - context.update(output_dict) + if block_name == output_block: + return self._get_output(output_variable, context) except Exception: LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - return outputs + if start_: + # We skipped all the blocks up to the end + raise ValueError('Unknown block name: {}'.format(start_)) + + if output_ is None: + return outputs def to_dict(self): """Return all the details of this MLPipeline in a dict. @@ -328,7 +564,8 @@ The content of the JSON file is the dict returned by the `to_dict` method. Args: - path (str): Path to the JSON file to write. + path (str): + Path to the JSON file to write. """ with open(path, 'w') as out_file: json.dump(self.to_dict(), out_file, indent=4) @@ -340,7 +577,8 @@ The dict structure is the same as the one created by the `to_dict` method. Args: - metadata (dict): Dictionary containing the pipeline specification. + metadata (dict): + Dictionary containing the pipeline specification. 
Returns: MLPipeline: @@ -372,7 +610,8 @@ The JSON file format is the same as the one created by the `to_dict` method. Args: - path (str): Path of the JSON file to load. + path (str): + Path of the JSON file to load. Returns: MLPipeline: diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py index 9bca6a5d..f2300f67 100644 --- a/mlblocks/primitives.py +++ b/mlblocks/primitives.py @@ -37,6 +37,7 @@ def add_primitives_path(path): Raises: ValueError: A `ValueError` will be raised if the path is not valid. + """ if path not in _PRIMITIVES_PATHS: if not os.path.isdir(path): @@ -68,7 +69,6 @@ def get_primitives_paths(): list: The list of folders. """ - primitives_paths = list() entry_points = pkg_resources.iter_entry_points('mlprimitives') for entry_point in entry_points: @@ -99,7 +99,6 @@ def load_primitive(name): ValueError: A `ValueError` will be raised if the primitive cannot be found. """ - for base_path in get_primitives_paths(): parts = name.split('.') number_of_parts = len(parts) diff --git a/setup.cfg b/setup.cfg index e976dec7..17244565 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,13 +3,13 @@ current_version = 0.3.1-dev commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))? -serialize = +serialize = {major}.{minor}.{patch}-{release} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = release -values = +values = dev release @@ -46,3 +46,8 @@ collect_ignore = ['setup.py'] [tool:pylint] good-names = X,y +[doc8] +max-line-length = 99 + +[pydocstyle] +add-ignore = D403,D413,D105,D107 diff --git a/setup.py b/setup.py index a8ac84d7..f355be93 100644 --- a/setup.py +++ b/setup.py @@ -40,10 +40,10 @@ 'm2r>=0.2.0', 'Sphinx>=1.7.1', 'sphinx_rtd_theme>=0.2.4', - 'graphviz==0.9', - 'ipython==6.5.0', - 'matplotlib==2.2.3', - 'recommonmark>=0.4.0', + 'graphviz>=0.9', + 'ipython>=6.5.0', + 'matplotlib>=2.2.3', + 'autodocsumm>=0.1.10', # style check 'flake8>=3.5.0', @@ -60,6 +60,10 @@ # Advanced testing 'tox>=2.9.1', 'coverage>=4.5.1', + + # Documentation style + 'doc8>=0.8.0', + 'pydocstyle>=3.0.0' ] diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py new file mode 100644 index 00000000..ce28d457 --- /dev/null +++ b/tests/features/test_partial_outputs.py @@ -0,0 +1,133 @@ +from unittest import TestCase +from unittest.mock import Mock + +import numpy as np + +from mlblocks.datasets import load_iris +from mlblocks.mlpipeline import MLPipeline + + +def almost_equal(obj1, obj2): + if isinstance(obj1, dict): + if not isinstance(obj2, dict): + raise AssertionError("{} is not equal to {}".format(type(obj2), dict)) + + for key, value in obj1.items(): + if key not in obj2: + raise AssertionError("{} not in {}".format(key, obj2)) + almost_equal(value, obj2[key]) + + else: + np.testing.assert_almost_equal(obj1, obj2) + + +class TestPartialOutputs(TestCase): + def setUp(self): + dataset = load_iris() + + self.X_train, self.X_test, self.y_train, self.y_test = dataset.get_splits(1) + + def test_fit_output(self): + + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + + int_block = 0 + invalid_int = 10 + str_block = 'sklearn.preprocessing.StandardScaler#1' + invalid_block = 'InvalidBlockName' + str_block_variable = 'sklearn.preprocessing.StandardScaler#1.y' + invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid' + + # Run + int_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], 
+ output_=int_block) + str_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=str_block) + str_out_variable = pipeline.fit(self.X_train[0:5], self.y_train[0:5], + output_=str_block_variable) + no_output = pipeline.fit(self.X_train, self.y_train) + + # Assert successful calls + X = np.array([ + [0.71269665, -1.45152899, 0.55344946, 0.31740553], + [0.26726124, 1.23648766, -1.1557327, -1.0932857], + [-1.95991577, 0.967686, -1.1557327, -1.0932857], + [0.71269665, -0.645124, 0.39067021, 0.31740553], + [0.26726124, -0.10752067, 1.36734573, 1.55176035] + ]) + y = np.array([1, 0, 0, 1, 2]) + context = { + 'X': X, + 'y': y + } + almost_equal(context, int_out) + almost_equal(context, str_out) + + almost_equal(y, str_out_variable) + + assert no_output is None + + # Run asserting exceptions + with self.assertRaises(IndexError): + pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_int) + + with self.assertRaises(ValueError): + pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_block) + + with self.assertRaises(ValueError): + pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_variable) + + def test_fit_start(self): + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + + # Mock the first block + block_mock = Mock() + pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock + + # Run, starting from the second block so the mocked first block is skipped + context = { + 'X': self.X_train, + 'y': self.y_train + } + int_start = 1 + str_start = 'sklearn.linear_model.LogisticRegression#1' + + pipeline.fit(start_=int_start, **context) + pipeline.fit(start_=str_start, **context) + + # Assert that mock has not been called + block_mock.fit.assert_not_called() + + def test_predict_start(self): + # Setup variables + primitives = [ + 'sklearn.preprocessing.StandardScaler', + 'sklearn.linear_model.LogisticRegression' + ] + pipeline = MLPipeline(primitives) + pipeline.fit(self.X_train, self.y_train) + + # Mock the first block + block_mock = Mock() + pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock + + # Run, starting from the second block so the mocked first block is skipped + context = { + 'X': self.X_train, + } + int_start = 1 + str_start = 'sklearn.linear_model.LogisticRegression#1' + + pipeline.predict(start_=int_start, **context) + pipeline.predict(start_=str_start, **context) + + # Assert that mock has not been called + block_mock.produce.assert_not_called()
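
For quick reference, the ``output_`` and ``start_`` arguments that this patch adds to ``MLPipeline.fit`` and ``MLPipeline.predict`` can be combined to run a pipeline in two halves. The following is a minimal sketch that mirrors the primitives and dataset used in ``tests/features/test_partial_outputs.py`` above; it assumes that the corresponding MLPrimitives annotations for the scikit-learn primitives are installed::

    from mlblocks.datasets import load_iris
    from mlblocks.mlpipeline import MLPipeline

    dataset = load_iris()
    X_train, X_test, y_train, y_test = dataset.get_splits(1)

    pipeline = MLPipeline([
        'sklearn.preprocessing.StandardScaler',
        'sklearn.linear_model.LogisticRegression',
    ])

    # Fit and produce only the first block, returning the whole context,
    # which now holds the scaled X together with the y given on input.
    context = pipeline.fit(X_train, y_train, output_=0)

    # Resume fitting from the second block, reusing the captured context.
    pipeline.fit(start_=1, **context)

    # Ask for an intermediate context variable by name, using the
    # {block_name}.{variable_name} output specification.
    scaled = pipeline.predict(X_test, output_='sklearn.preprocessing.StandardScaler#1.X')

    # Without output_, predict keeps returning the output of the last block.
    predictions = pipeline.predict(X_test)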