From d3cbee730139b2d0117a1de1474a581844505196 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 19 Apr 2019 13:38:02 +0200 Subject: [PATCH 01/11] Initial implementation to work with intermediate outputs --- mlblocks/mlpipeline.py | 82 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 11 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index 058737ee..d5928b69 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -166,7 +166,7 @@ def _get_block_args(self, block_name, block_args, context): return kwargs - def _get_outputs(self, block_name, outputs, block_outputs): + def _extract_outputs(self, block_name, outputs, block_outputs): # TODO: type validation and/or transformation should be done here if not isinstance(outputs, tuple): @@ -188,7 +188,40 @@ def _get_outputs(self, block_name, outputs, block_outputs): return output_dict - def fit(self, X=None, y=None, **kwargs): + def _get_block_name(self, index): + return list(self.blocks.keys())[index] + + def _get_output_spec(self, output): + if output is None: + return None, None + + if isinstance(output, int): + output = self._get_block_name(output) + + if output in self.blocks: + return output, None + + if '.' in output: + output_block, output_variable = output.rsplit('.', 1) + if output_block not in self.blocks: + raise ValueError('Unknown block name: {}'.format(output_block)) + + return output_block, output_variable + + last_block_name = self._get_block_name(-1) + return last_block_name, output + + def _get_output(self, output_variable, context): + if output_variable: + if output_variable not in context: + raise ValueError('Output variable {} not found in context' + .format(output_variable)) + + return context[output_variable] + else: + return context + + def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs): """Fit the blocks of this pipeline. Sequentially call the `fit` and the `produce` methods of each block, @@ -213,8 +246,19 @@ def fit(self, X=None, y=None, **kwargs): } context.update(kwargs) - last_block_name = list(self.blocks.keys())[-1] + output_block, output_variable = self._get_output_spec(output) + last_block_name = self._get_block_name(-1) + + if isinstance(skip_to, int): + skip_to = self._get_block_name(skip_to) + for block_name, block in self.blocks.items(): + if block_name == skip_to: + skip_to = False + elif skip_to: + LOGGER.debug("Skipping block %s fit", block_name) + continue + LOGGER.debug("Fitting block %s", block_name) try: fit_args = self._get_block_args(block_name, block.fit_args, context) @@ -223,19 +267,22 @@ def fit(self, X=None, y=None, **kwargs): LOGGER.exception("Exception caught fitting MLBlock %s", block_name) raise - if block_name != last_block_name: + if (block_name != last_block_name) or (block_name == output_block): LOGGER.debug("Producing block %s", block_name) try: produce_args = self._get_block_args(block_name, block.produce_args, context) outputs = block.produce(**produce_args) - output_dict = self._get_outputs(block_name, outputs, block.produce_output) + output_dict = self._extract_outputs(block_name, outputs, block.produce_output) context.update(output_dict) except Exception: LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - def predict(self, X=None, **kwargs): + if block_name == output_block: + return self._get_output(output_variable, context) + + def predict(self, X=None, output='y', skip_to=None, **kwargs): """Produce predictions using the blocks of this pipeline. 
Sequentially call the `produce` method of each block, capturing the
@@ -256,22 +303,35 @@ def predict(self, X=None, **kwargs):
         }
         context.update(kwargs)
 
-        last_block_name = list(self.blocks.keys())[-1]
+        output_block, output_variable = self._get_output_spec(output)
+
+        if isinstance(skip_to, int):
+            skip_to = self._get_block_name(skip_to)
+
         for block_name, block in self.blocks.items():
+            if block_name == skip_to:
+                skip_to = False
+            elif skip_to:
+                LOGGER.debug("Skipping block %s produce", block_name)
+                continue
+
             LOGGER.debug("Producing block %s", block_name)
 
             try:
                 produce_args = self._get_block_args(block_name, block.produce_args, context)
                 outputs = block.produce(**produce_args)
+                output_dict = self._extract_outputs(block_name, outputs, block.produce_output)
+                context.update(output_dict)
 
-                if block_name != last_block_name:
-                    output_dict = self._get_outputs(block_name, outputs, block.produce_output)
-                    context.update(output_dict)
+                if block_name == output_block:
+                    return self._get_output(output_variable, context)
 
             except Exception:
                 LOGGER.exception("Exception caught producing MLBlock %s", block_name)
                 raise
 
-        return outputs
+        if skip_to:
+            # We skipped all the blocks up to the end
+            raise ValueError('Unknown block name: {}'.format(skip_to))
 
     def to_dict(self):
         """Return all the details of this MLPipeline in a dict.

From 59fae909d44afb78005425c6c4a24de567391eb5 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 6 May 2019 22:48:38 +0200 Subject: [PATCH 02/11] Update contributing guide to match the current release workflow --- CONTRIBUTING.rst | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 2db74080..4fce53bf 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -172,24 +172,26 @@ The process of releasing a new version involves several steps combining both ``g
 
 1. Merge what is in ``master`` branch into ``stable`` branch.
 2. Update the version in ``setup.cfg``, ``mlblocks/__init__.py`` and ``HISTORY.md`` files.
-3. Create a new TAG pointing at the correspoding commit in ``stable`` branch.
+3. Create a new git tag pointing at the corresponding commit in ``stable`` branch.
 4. Merge the new commit from ``stable`` into ``master``.
-5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py`` to open the next
-   development interation.
+5. Update the version in ``setup.cfg`` and ``mlblocks/__init__.py``
+   to open the next development iteration.
 
-**Note:** Before starting the process, make sure that ``HISTORY.md`` has a section titled
-**Unreleased** with the list of changes that will be included in the new version, and that
-these changes are committed and available in ``master`` branch.
-Normally this is just a list of the Pull Requests that have been merged since the latest version.
+.. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new
+   entry that explains the changes that will be included in the new version.
+   Normally this is just a list of the Pull Requests that have been merged to master
+   since the last release.
 
-Once this is done, just run the following commands::
+Once this is done, run one of the following commands:
+
+1. If you are releasing a patch version::
 
-    git checkout stable
-    git merge --no-ff master   # This creates a merge commit
-    bumpversion release   # This creates a new commit and a TAG
-    git push --tags origin stable
     make release
-    git checkout master
-    git merge stable
-    bumpversion --no-tag patch
-    git push
+
+2. If you are releasing a minor version::
+
+      make release-minor
+
+3. If you are releasing a major version::
+
+      make release-major

From e768037076387fcb9a33e494c9c89421f0c657a8 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 6 May 2019 22:49:47 +0200 Subject: [PATCH 03/11] Update docs config --- Makefile | 1 - docs/changelog.rst | 2 +- docs/conf.py | 20 +++++++------------- setup.py | 1 - 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index dc62e90d..c2d2aaa4 100644
--- a/Makefile
+++ b/Makefile
@@ -122,7 +122,6 @@ coverage: ## check code coverage quickly with the default Python
 .PHONY: docs
 docs: clean-docs ## generate Sphinx HTML documentation, including API docs
 	$(MAKE) -C docs html
-	touch docs/_build/html/.nojekyll
 
 .PHONY: view-docs
 view-docs: docs ## view docs in browser
diff --git a/docs/changelog.rst b/docs/changelog.rst
index fcd2eb2d..d26e5be8 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1 +1 @@
-.. include:: ../HISTORY.md
+.. mdinclude:: ../HISTORY.md
diff --git a/docs/conf.py b/docs/conf.py
index 8659996f..9b4595ec 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,18 +18,9 @@
 # relative to the documentation root, use os.path.abspath to make it
 # absolute, like shown here.
 
-import os
-import sys
-
 import sphinx_rtd_theme  # For read the docs theme
-from recommonmark.parser import CommonMarkParser
-# from recommonmark.transform import AutoStructify
-
-# sys.path.insert(0, os.path.abspath('..'))
 import mlblocks
-#
-# mlblocks.add_primitives_path('../mlblocks_primitives')
 
 # -- General configuration ---------------------------------------------
 
@@ -40,8 +31,11 @@
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = [
-    'sphinx.ext.napoleon',
+    'm2r',
+    'sphinx.ext.autodoc',
     'sphinx.ext.githubpages',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.napoleon',
     'sphinx.ext.graphviz',
     'IPython.sphinxext.ipython_console_highlighting',
     'IPython.sphinxext.ipython_directive',
@@ -56,9 +50,9 @@
 # You can specify multiple suffix as a list of string:
 source_suffix = ['.rst', '.md', '.ipynb']
 
-source_parsers = {
-    '.md': CommonMarkParser,
-}
+# source_parsers = {
+#     '.md': CommonMarkParser,
+# }
 
 # The master toctree document.
 master_doc = 'index'
diff --git a/setup.py b/setup.py
index a8ac84d7..f6991ab1 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,6 @@
     'graphviz==0.9',
     'ipython==6.5.0',
     'matplotlib==2.2.3',
-    'recommonmark>=0.4.0',
 
     # style check
     'flake8>=3.5.0',

From 080580d45c9b47680fbc31d30aee4e8478292711 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 6 May 2019 22:50:08 +0200 Subject: [PATCH 04/11] Remove spaces --- setup.cfg | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index e976dec7..62ced521 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -3,13 +3,13 @@ current_version = 0.3.1-dev
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
-serialize = 
+serialize =
 	{major}.{minor}.{patch}-{release}
 	{major}.{minor}.{patch}
 
 [bumpversion:part:release]
 optional_value = release
-values = 
+values =
 	dev
 	release
 
@@ -45,4 +45,3 @@ collect_ignore = ['setup.py']
 
 [tool:pylint]
 good-names = X,y
-

From e25fa6d3ac3af2f20b205ed73d91d28124bc8c16 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 6 May 2019 22:50:32 +0200 Subject: [PATCH 05/11] Add docstrings --- mlblocks/mlpipeline.py | 127 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 113 insertions(+), 14 deletions(-)

diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index d5928b69..abbac922 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -69,6 +69,7 @@ class MLPipeline():
     """
 
     def _get_tunable_hyperparameters(self):
+        """Get the tunable hyperparameters from all the blocks in this pipeline."""
         tunable = {}
         for block_name, block in self.blocks.items():
             tunable[block_name] = block.get_tunable_hyperparameters()
@@ -140,6 +141,24 @@ def set_hyperparameters(self, hyperparameters):
             self.blocks[block_name].set_hyperparameters(block_hyperparams)
 
     def _get_block_args(self, block_name, block_args, context):
+        """Get the arguments expected by the block method from the context.
+
+        The arguments will be taken from the context using both the method
+        arguments specification and the `input_names` given when the pipeline
+        was created.
+
+        Args:
+            block_name (str): Name of this block. Used to find the corresponding
+                input_names.
+            block_args (list): list of method argument specifications from the
+                primitive.
+            context (dict): current context dictionary.
+
+        Returns:
+            dict:
+                A dictionary containing the argument names and values to pass
+                to the method.
+        """
         # TODO: type validation and/or transformation should be done here
 
         input_names = self.input_names.get(block_name, dict())
@@ -167,6 +186,7 @@ def _get_block_args(self, block_name, block_args, context):
         return kwargs
 
     def _extract_outputs(self, block_name, outputs, block_outputs):
+        """Extract the outputs of the method as a dict to be set into the context."""
         # TODO: type validation and/or transformation should be done here
 
         if not isinstance(outputs, tuple):
@@ -189,9 +209,36 @@ def _extract_outputs(self, block_name, outputs, block_outputs):
         return output_dict
 
     def _get_block_name(self, index):
+        """Get the name of the block in the `index` position."""
         return list(self.blocks.keys())[index]
 
     def _get_output_spec(self, output):
+        """Parsre the output specification and get a block name and a variable name.
+
+        The output specification can be of two types: int and str.
+
+        If it is an integer, it is interpreted as a block index, and the variable name
+        is considered to be ``None``, which means that the whole context will be returned.
+
+        If it is a string, it is interpreted as the block name, and it has to match a block
+        name exactly, including its hash and counter number ``#n``. Optionally, a variable
+        name can be passed at the end using a ``'.'`` as a separator.
+        In this case, the format of the string is `{block_name}.{variable_name}`. Note
+        that the block name can also contain dots, so only the rightmost dot will be
+        considered, and only if the complete string does not match exactly a block name.
+
+        Args:
+            output (str or int): Output specification as either a string or an integer.
+
+        Returns:
+            tuple:
+                The output is a tuple containing:
+                    * block_name (str): name of the block from which the output will be
+                      returned, including its counter number.
+                    * variable_name (str): Name of the variable to extract from the context.
+                      It can be ``None``, which means that the whole context is to be
+                      returned.
+        """
         if output is None:
             return None, None
 
@@ -212,6 +259,10 @@ def _get_output_spec(self, output):
         return last_block_name, output
 
     def _get_output(self, output_variable, context):
+        """Get the specified output variable from the context.
+
+        If the variable name is ``None``, return the entire context.
+        """
         if output_variable:
             if output_variable not in context:
                 raise ValueError('Output variable {} not found in context'
@@ -221,7 +272,7 @@ def _get_output(self, output_variable, context):
         else:
             return context
 
-    def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs):
+    def fit(self, X=None, y=None, output=None, start_on=None, **kwargs):
         """Fit the blocks of this pipeline.
 
         Sequentially call the `fit` and the `produce` methods of each block,
@@ -237,8 +288,32 @@ def fit(self, X=None, y=None, output=None, start_on=None, **kwargs):
             X: Fit Data, which the pipeline will learn from.
             y: Fit Data labels, which the pipeline will use to learn how to
                 behave.
+            output (str or int): Output specification, which can be a string or an integer.
+                If an integer is given, it is interpreted as the block number, and the whole
+                context after running the specified block will be returned.
+                If a string is given, it is expected to be the name of one block, including
+                its counter number at the end. Optionally, a variable name can be included
+                at the end after the counter number using a ``'.'`` as a separator between the
+                block name and the variable name. If the variable name is given, this will be
+                extracted from the context and returned. Otherwise, the whole context will
+                be returned.
+            start_on (str or int): Block index or block name to start processing from. The
+                value can either be an integer, which will be interpreted as a block index,
+                or the name of a block, including the counter number at the end.
+                If given, the execution of the pipeline will start on the specified block,
+                and all the blocks before that one will be skipped.
             **kwargs: Any additional keyword arguments will be directly added
                 to the context dictionary and available for the blocks.
+
+        Returns:
+            None or dict or object:
+                * If no output is specified, nothing will be returned.
+                * If an output block has been specified without an output variable, the
+                  context dictionary will be returned after the produce method of that block
+                  has been called.
+                * If both an output block and an output variable have been specified,
+                  the value of that variable from the context will be extracted and returned
+                  after the produce method of that block has been called.
""" context = { 'X': X, @@ -249,13 +324,13 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs): output_block, output_variable = self._get_output_spec(output) last_block_name = self._get_block_name(-1) - if isinstance(skip_to, int): - skip_to = self._get_block_name(skip_to) + if isinstance(start_on, int): + start_on = self._get_block_name(start_on) for block_name, block in self.blocks.items(): - if block_name == skip_to: - skip_to = False - elif skip_to: + if block_name == start_on: + start_on = False + elif start_on: LOGGER.debug("Skipping block %s fit", block_name) continue @@ -282,7 +357,7 @@ def fit(self, X=None, y=None, output=None, skip_to=None, **kwargs): if block_name == output_block: return self._get_output(output_variable, context) - def predict(self, X=None, output='y', skip_to=None, **kwargs): + def predict(self, X=None, output='y', start_on=None, **kwargs): """Produce predictions using the blocks of this pipeline. Sequentially call the `produce` method of each block, capturing the @@ -295,8 +370,32 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs): Args: X: Data which the pipeline will use to make predictions. + output (str or int): Output specification, which can be a string or an integer. + If an integer is given, it is interpreted as the block number, and the whole + context after running the specified block will be returned. + If a string is given, it is expected to be the name of one block, including + its counter number at the end. Optionally, a variable name can be included + at the end after the counter number using a ``'.'`` as a separator between the + block name and the variable name. If the variable name is given, this will be + extracted from the context and returned. Otherwise, the whole context will + be returned. + start_on (str or int): Block index or block name to start processing from. The + value can either be an integer, which will be interpreted as a block index, + or the name of a block, including the conter number at the end. + If given, the execution of the pipeline will start on the specified block, + and all the blocks before that one will be skipped. **kwargs: Any additional keyword arguments will be directly added to the context dictionary and available for the blocks. + + Returns: + None or dict or object: + * If no output is specified, the output of the last block will be returned. + * If an output block has been specified without and output variable, the + context dictionary will be returned after the produce method of that block + has been called. + * If both an output block and an output variable have been specified, + the value of that variable from the context will extracted and returned + after the produce method of that block has been called. 
""" context = { 'X': X @@ -305,13 +404,13 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs): output_block, output_variable = self._get_output_spec(output) - if isinstance(skip_to, int): - skip_to = self._get_block_name(skip_to) + if isinstance(start_on, int): + start_on = self._get_block_name(start_on) for block_name, block in self.blocks.items(): - if block_name == skip_to: - skip_to = False - elif skip_to: + if block_name == start_on: + start_on = False + elif start_on: LOGGER.debug("Skipping block %s produce", block_name) continue @@ -329,9 +428,9 @@ def predict(self, X=None, output='y', skip_to=None, **kwargs): LOGGER.exception("Exception caught producing MLBlock %s", block_name) raise - if skip_to: + if start_on: # We skipped all the blocks up to the end - raise ValueError('Unknown block name: {}'.format(skip_to)) + raise ValueError('Unknown block name: {}'.format(start_on)) def to_dict(self): """Return all the details of this MLPipeline in a dict. From 5e9be7aa7188d38ca6eafb684c24171b9e61f322 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 6 May 2019 22:51:09 +0200 Subject: [PATCH 06/11] Update primitive names to match the latest versions of MLPrimitives --- docs/getting_started/quickstart.rst | 2 +- docs/pipeline_examples/graph.rst | 4 ++-- docs/pipeline_examples/text.rst | 22 +++++++++++----------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index 2e00ece6..2115fcef 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -24,7 +24,7 @@ them to the `MLPipeline class`_: from mlblocks import MLPipeline primitives = [ - 'mlprimitives.feature_extraction.StringVectorizer', + 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] pipeline = MLPipeline(primitives) diff --git a/docs/pipeline_examples/graph.rst b/docs/pipeline_examples/graph.rst index 5503e739..54ef85a1 100644 --- a/docs/pipeline_examples/graph.rst +++ b/docs/pipeline_examples/graph.rst @@ -39,7 +39,7 @@ additional information not found inside `X`. primitives = [ 'networkx.link_prediction_feature_extraction', - 'mlprimitives.feature_extraction.CategoricalEncoder', + 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 'sklearn.preprocessing.StandardScaler', 'xgboost.XGBClassifier' ] @@ -69,6 +69,6 @@ additional information not found inside `X`. .. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html -.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.feature_extraction.CategoricalEncoder.json +.. _CategoricalEncoder from MLPrimitives: https://github.com/HDI-Project/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn diff --git a/docs/pipeline_examples/text.rst b/docs/pipeline_examples/text.rst index df8a9d5a..03472ea3 100644 --- a/docs/pipeline_examples/text.rst +++ b/docs/pipeline_examples/text.rst @@ -40,31 +40,31 @@ for later ones. 
# set up the pipeline primitives = [ - "mlprimitives.counters.UniqueCounter", - "mlprimitives.text.TextCleaner", - "mlprimitives.counters.VocabularyCounter", + "mlprimitives.custom.counters.UniqueCounter", + "mlprimitives.custom.text.TextCleaner", + "mlprimitives.custom.counters.VocabularyCounter", "keras.preprocessing.text.Tokenizer", "keras.preprocessing.sequence.pad_sequences", "keras.Sequential.LSTMTextClassifier" ] input_names = { - "mlprimitives.counters.UniqueCounter#1": { + "mlprimitives.custom.counters.UniqueCounter#1": { "X": "y" } } output_names = { - "mlprimitives.counters.UniqueCounter#1": { + "mlprimitives.custom.counters.UniqueCounter#1": { "counts": "classes" }, - "mlprimitives.counters.VocabularyCounter#1": { + "mlprimitives.custom.counters.VocabularyCounter#1": { "counts": "vocabulary_size" } } init_params = { - "mlprimitives.counters.VocabularyCounter#1": { + "mlprimitives.custom.counters.VocabularyCounter#1": { "add": 1 }, - "mlprimitives.text.TextCleaner#1": { + "mlprimitives.custom.text.TextCleaner#1": { "language": "en" }, "keras.preprocessing.sequence.pad_sequences#1": { @@ -116,12 +116,12 @@ to encode all the string features, and go directly into the nltk.download('stopwords') primitives = [ - 'mlprimitives.text.TextCleaner', - 'mlprimitives.feature_extraction.StringVectorizer', + 'mlprimitives.custom.text.TextCleaner', + 'mlprimitives.custom.feature_extraction.StringVectorizer', 'sklearn.ensemble.RandomForestClassifier', ] init_params = { - 'mlprimitives.text.TextCleaner': { + 'mlprimitives.custom.text.TextCleaner': { 'column': 'text', 'language': 'nl' }, From 9f0ae6a3fa000896d8f530b72f6da46d23c31e4b Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 7 May 2019 17:12:33 +0200 Subject: [PATCH 07/11] Add random state to datasets get_splits --- mlblocks/datasets.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py index b5ed6b46..fb32df9c 100644 --- a/mlblocks/datasets.py +++ b/mlblocks/datasets.py @@ -141,7 +141,7 @@ def _get_split(data, index): else: return data[index] - def get_splits(self, n_splits=1): + def get_splits(self, n_splits=1, random_state=0): """Return splits of this dataset ready for Cross Validation. If n_splits is 1, a tuple containing the X for train and test @@ -166,12 +166,13 @@ def get_splits(self, n_splits=1): self.data, self.target, shuffle=self._shuffle, - stratify=stratify + stratify=stratify, + random_state=random_state ) else: cv_class = StratifiedKFold if self._stratify else KFold - cv = cv_class(n_splits=n_splits, shuffle=self._shuffle) + cv = cv_class(n_splits=n_splits, shuffle=self._shuffle, random_state=random_state) splits = list() for train, test in cv.split(self.data, self.target): From 5aea64755b7b7f9b4e68f6faa9a0912c1a55033a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 7 May 2019 17:12:58 +0200 Subject: [PATCH 08/11] Rename output and start arguments --- mlblocks/mlpipeline.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py index abbac922..91e44341 100644 --- a/mlblocks/mlpipeline.py +++ b/mlblocks/mlpipeline.py @@ -272,7 +272,7 @@ def _get_output(self, output_variable, context): else: return context - def fit(self, X=None, y=None, output=None, start_on=None, **kwargs): + def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): """Fit the blocks of this pipeline. 
Sequentially call the `fit` and the `produce` methods of each block,
@@ -288,7 +288,7 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
             X: Fit Data, which the pipeline will learn from.
             y: Fit Data labels, which the pipeline will use to learn how to
                 behave.
-            output (str or int): Output specification, which can be a string or an integer.
+            output_ (str or int): Output specification, which can be a string or an integer.
                 If an integer is given, it is interpreted as the block number, and the whole
                 context after running the specified block will be returned.
                 If a string is given, it is expected to be the name of one block, including
@@ -297,7 +297,7 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
                 block name and the variable name. If the variable name is given, this will be
                 extracted from the context and returned. Otherwise, the whole context will
                 be returned.
-            start_on (str or int): Block index or block name to start processing from. The
+            start_ (str or int): Block index or block name to start processing from. The
                 value can either be an integer, which will be interpreted as a block index,
                 or the name of a block, including the counter number at the end.
If given, the execution of the pipeline will start on the specified block,
                 and all the blocks before that one will be skipped.
             **kwargs: Any additional keyword arguments will be directly added
                 to the context dictionary and available for the blocks.
@@ -321,16 +321,16 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
         }
         context.update(kwargs)
 
-        output_block, output_variable = self._get_output_spec(output)
+        output_block, output_variable = self._get_output_spec(output_)
         last_block_name = self._get_block_name(-1)
 
-        if isinstance(start_on, int):
-            start_on = self._get_block_name(start_on)
+        if isinstance(start_, int):
+            start_ = self._get_block_name(start_)
 
         for block_name, block in self.blocks.items():
-            if block_name == start_on:
-                start_on = False
-            elif start_on:
+            if block_name == start_:
+                start_ = False
+            elif start_:
                 LOGGER.debug("Skipping block %s fit", block_name)
                 continue
 
@@ -357,7 +357,7 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
             if block_name == output_block:
                 return self._get_output(output_variable, context)
 
-    def predict(self, X=None, output='y', start_on=None, **kwargs):
+    def predict(self, X=None, output_='y', start_=None, **kwargs):
         """Produce predictions using the blocks of this pipeline.
 
         Sequentially call the `produce` method of each block, capturing the
@@ -370,7 +370,7 @@ def predict(self, X=None, output_='y', start_=None, **kwargs):
 
         Args:
             X: Data which the pipeline will use to make predictions.
-            output (str or int): Output specification, which can be a string or an integer.
+            output_ (str or int): Output specification, which can be a string or an integer.
                 If an integer is given, it is interpreted as the block number, and the whole
                 context after running the specified block will be returned.
                 If a string is given, it is expected to be the name of one block, including
@@ -379,7 +379,7 @@ def predict(self, X=None, output_='y', start_=None, **kwargs):
                 block name and the variable name. If the variable name is given, this will be
                 extracted from the context and returned. Otherwise, the whole context will
                 be returned.
-            start_on (str or int): Block index or block name to start processing from. The
+            start_ (str or int): Block index or block name to start processing from. The
                 value can either be an integer, which will be interpreted as a block index,
                 or the name of a block, including the counter number at the end.
                 If given, the execution of the pipeline will start on the specified block,
                 and all the blocks before that one will be skipped.
@@ -402,15 +402,15 @@ def predict(self, X=None, output_='y', start_=None, **kwargs):
         }
         context.update(kwargs)
 
-        output_block, output_variable = self._get_output_spec(output)
+        output_block, output_variable = self._get_output_spec(output_)
 
-        if isinstance(start_on, int):
-            start_on = self._get_block_name(start_on)
+        if isinstance(start_, int):
+            start_ = self._get_block_name(start_)
 
         for block_name, block in self.blocks.items():
-            if block_name == start_on:
-                start_on = False
-            elif start_on:
+            if block_name == start_:
+                start_ = False
+            elif start_:
                 LOGGER.debug("Skipping block %s produce", block_name)
                 continue
 
@@ -428,9 +428,9 @@ def predict(self, X=None, output_='y', start_=None, **kwargs):
                 LOGGER.exception("Exception caught producing MLBlock %s", block_name)
                 raise
 
-        if start_on:
+        if start_:
             # We skipped all the blocks up to the end
-            raise ValueError('Unknown block name: {}'.format(start_on))
+            raise ValueError('Unknown block name: {}'.format(start_))
 
     def to_dict(self):
         """Return all the details of this MLPipeline in a dict.

From 4607b3898aa9767774f872b936f2311492179746 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 7 May 2019 17:13:12 +0200 Subject: [PATCH 09/11] Add unit tests for partial outputs feature --- tests/features/test_partial_outputs.py | 133 +++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 tests/features/test_partial_outputs.py

diff --git a/tests/features/test_partial_outputs.py b/tests/features/test_partial_outputs.py
new file mode 100644
index 00000000..ce28d457
--- /dev/null
+++ b/tests/features/test_partial_outputs.py
@@ -0,0 +1,133 @@
+from unittest import TestCase
+from unittest.mock import Mock
+
+import numpy as np
+
+from mlblocks.datasets import load_iris
+from mlblocks.mlpipeline import MLPipeline
+
+
+def almost_equal(obj1, obj2):
+    if isinstance(obj1, dict):
+        if not isinstance(obj2, dict):
+            raise AssertionError("{} is not equal to {}".format(type(obj2), dict))
+
+        for key, value in obj1.items():
+            if key not in obj2:
+                raise AssertionError("{} not in {}".format(key, obj2))
+            almost_equal(value, obj2[key])
+
+    else:
+        np.testing.assert_almost_equal(obj1, obj2)
+
+
+class TestPartialOutputs(TestCase):
+    def setUp(self):
+        dataset = load_iris()
+
+        self.X_train, self.X_test, self.y_train, self.y_test = dataset.get_splits(1)
+
+    def test_fit_output(self):
+
+        # Setup variables
+        primitives = [
+            'sklearn.preprocessing.StandardScaler',
+            'sklearn.linear_model.LogisticRegression'
+        ]
+        pipeline = MLPipeline(primitives)
+
+        int_block = 0
+        invalid_int = 10
+        str_block = 'sklearn.preprocessing.StandardScaler#1'
+        invalid_block = 'InvalidBlockName'
+        str_block_variable = 'sklearn.preprocessing.StandardScaler#1.y'
+        invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid'
+
+        # Run
+        int_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=int_block)
+        str_out = pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=str_block)
+        str_out_variable = pipeline.fit(self.X_train[0:5], self.y_train[0:5],
+                                        output_=str_block_variable)
+        no_output = pipeline.fit(self.X_train, self.y_train)
+
+        # Assert successful calls
+        X = np.array([
+            [0.71269665, -1.45152899, 0.55344946, 0.31740553],
+            [0.26726124, 1.23648766, -1.1557327, -1.0932857],
+            [-1.95991577, 0.967686, -1.1557327, -1.0932857],
+            [0.71269665, -0.645124, 0.39067021, 0.31740553],
+            [0.26726124, -0.10752067, 1.36734573, 1.55176035]
+        ])
+        y = np.array([1, 0, 0, 1, 2])
+        context = {
+            'X': X,
+            'y': y
+        }
+        almost_equal(context, int_out)
+        almost_equal(context, str_out)
+
+        almost_equal(y, str_out_variable)
+
+        assert no_output is None
+
+        # Run asserting exceptions
+        with self.assertRaises(IndexError):
+            pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_int)
+
+        with self.assertRaises(ValueError):
+            pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_block)
+
+        with self.assertRaises(ValueError):
+            pipeline.fit(self.X_train[0:5], self.y_train[0:5], output_=invalid_variable)
+
+    def test_fit_start(self):
+        # Setup variables
+        primitives = [
+            'sklearn.preprocessing.StandardScaler',
+            'sklearn.linear_model.LogisticRegression'
+        ]
+        pipeline = MLPipeline(primitives)
+
+        # Mock the first block
+        block_mock = Mock()
+        pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock
+
+        # Run first block
+        context = {
+            'X': self.X_train,
+            'y': self.y_train
+        }
+        int_start = 1
+        str_start = 'sklearn.linear_model.LogisticRegression#1'
+
+        pipeline.fit(start_=int_start, **context)
+        pipeline.fit(start_=str_start, **context)
+
+        # Assert that mock has not been called
+        block_mock.fit.assert_not_called()
+
+    def test_predict_start(self):
+        # Setup variables
+        primitives = [
+            'sklearn.preprocessing.StandardScaler',
+            'sklearn.linear_model.LogisticRegression'
+        ]
+        pipeline = MLPipeline(primitives)
+        pipeline.fit(self.X_train, self.y_train)
+
+        # Mock the first block
+        block_mock = Mock()
+        pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock
+
+        # Run first block
+        context = {
+            'X': self.X_train,
+        }
+        int_start = 1
+        str_start = 'sklearn.linear_model.LogisticRegression#1'
+
+        pipeline.predict(start_=int_start, **context)
+        pipeline.predict(start_=str_start, **context)
+
+        # Assert that mock has not been called
+        block_mock.produce.assert_not_called()

From 980794b67165e286d49cb81cf742ea44fd760365 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 9 May 2019 15:14:23 +0200 Subject: [PATCH 10/11] Improve docstrings and add toc in autogenerated API reference --- Makefile | 5 + docs/conf.py | 9 +- mlblocks/datasets.py | 12 +- mlblocks/mlblock.py | 79 +++++++------ mlblocks/mlpipeline.py | 256 +++++++++++++++++++++++++++-------------- mlblocks/primitives.py | 3 +- setup.cfg | 6 + setup.py | 4 + 8 files changed, 234 insertions(+), 140 deletions(-)

diff --git a/Makefile b/Makefile
index c2d2aaa4..6266033f 100644
--- a/Makefile
+++ b/Makefile
@@ -98,6 +98,11 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort
 	autopep8 --in-place --recursive --aggressive tests
 	isort --apply --atomic --recursive tests
 
+.PHONY: lint-docs
+lint-docs: ## check docs formatting with doc8 and pydocstyle
+	doc8 mlblocks/
+	pydocstyle mlblocks/
+
 
 # TEST TARGETS
 
diff --git a/docs/conf.py b/docs/conf.py
index 9b4595ec..95653914 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -39,8 +39,13 @@
     'sphinx.ext.graphviz',
     'IPython.sphinxext.ipython_console_highlighting',
     'IPython.sphinxext.ipython_directive',
+    'autodocsumm',
 ]
 
+autodoc_default_options = {
+    'autosummary': True,
+}
+
 ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -50,10 +55,6 @@
 # You can specify multiple suffix as a list of string:
 source_suffix = ['.rst', '.md', '.ipynb']
 
-# source_parsers = {
-#     '.md': CommonMarkParser,
-# }
-
 # The master toctree document.
master_doc = 'index'
 
diff --git a/mlblocks/datasets.py b/mlblocks/datasets.py
index fb32df9c..0c69afda 100644
--- a/mlblocks/datasets.py
+++ b/mlblocks/datasets.py
@@ -100,6 +100,7 @@ class Dataset():
         **kwargs: Any additional keyword argument passed on initialization
             will be made available as instance attributes.
     """
+
     def __init__(self, description, data, target, score, shuffle=True, stratify=False, **kwargs):
 
         self.name = description.splitlines()[0]
@@ -115,10 +116,10 @@ def __init__(self, description, data, target, score, shuffle=True, stratify=False, **kwargs):
         self.__dict__.update(kwargs)
 
     def score(self, *args, **kwargs):
-        """Scoring function for this dataset.
+        r"""Scoring function for this dataset.
 
         Args:
-            \\*args, \\*\\*kwargs: Any given arguments and keyword arguments will be
+            \*args, \*\*kwargs: Any given arguments and keyword arguments will be
                 directly passed to the given scoring function.
 
         Returns:
@@ -315,7 +316,6 @@ def load_dic28():
     There exist 52,652 words (vertices in a network) having 2 up to 8 characters
     in the dictionary. The obtained network has 89038 edges.
     """
-
     dataset_path = _load('dic28')
 
     X = _load_csv(dataset_path, 'data')
@@ -344,7 +344,6 @@ def load_nomination():
     Data consists of one graph whose nodes contain two attributes, attr1 and attr2.
     Associated with each node is a label that has to be learned and predicted.
     """
-
     dataset_path = _load('nomination')
 
     X = _load_csv(dataset_path, 'data')
@@ -363,7 +362,6 @@ def load_amazon():
     co-purchased with product j, the graph contains an undirected edge from i to j.
     Each product category provided by Amazon defines each ground-truth community.
    """
-
     dataset_path = _load('amazon')
 
     X = _load_csv(dataset_path, 'data')
@@ -383,7 +381,6 @@ def load_jester():
         source: "University of California Berkeley, CA"
         sourceURI: "http://eigentaste.berkeley.edu/dataset/"
     """
-
     dataset_path = _load('jester')
 
     X = _load_csv(dataset_path, 'data')
@@ -393,7 +390,7 @@ def load_jester():
 
 
 def load_wikiqa():
-    """A Challenge Dataset for Open-Domain Question Answering.
+    """Challenge Dataset for Open-Domain Question Answering.
 
     WikiQA dataset is a publicly available set of question and sentence (QS) pairs,
     collected and annotated for research on open-domain question answering.
 
     source: "Microsoft"
     sourceURI: "https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/#"
     """  # noqa
-
     dataset_path = _load('wikiqa')
 
     data = _load_csv(dataset_path, 'data', set_index=True)
diff --git a/mlblocks/mlblock.py b/mlblocks/mlblock.py
index a5cdb6a4..c3878e68 100644
--- a/mlblocks/mlblock.py
+++ b/mlblocks/mlblock.py
@@ -25,32 +25,34 @@ class MLBlock():
     as wrapping them and providing a common interface to run them.
 
     Attributes:
-        name (str): Name given to this MLBlock.
-        primitive (object): the actual function or instance which this MLBlock
-            wraps.
-        fit_args (dict): specification of the arguments expected by the `fit`
-            method.
-        fit_method (str): name of the primitive method to call on `fit`.
-            `None` if the primitive is a function.
-        produce_args (dict): specification of the arguments expected by the
-            `predict` method.
-        produce_output (dict): specification of the outputs of the `produce`
-            method.
-        produce_method (str): name of the primitive method to call on
-            `produce`. `None` if the primitive is a function.
+        name (str):
+            Name given to this MLBlock.
+        primitive (object):
+            the actual function or instance which this MLBlock wraps.
+ fit_args (dict): + specification of the arguments expected by the `fit` method. + fit_method (str): + name of the primitive method to call on `fit`. `None` if the primitive is a function. + produce_args (dict): + specification of the arguments expected by the `predict` method. + produce_output (dict): + specification of the outputs of the `produce` method. + produce_method (str): + name of the primitive method to call on `produce`. `None` if the primitive is a + function. Args: - name (str): Name given to this MLBlock. - **kwargs: Any additional arguments that will be used as - hyperparameters or passed to the `fit` or `produce` - methods. + name (str): + Name given to this MLBlock. + **kwargs: + Any additional arguments that will be used as hyperparameters or passed to the + `fit` or `produce` methods. Raises: - TypeError: A `TypeError` is raised if a required argument is not - found within the `kwargs` or if an unexpected - argument has been given. - """ - # pylint: disable=too-many-instance-attributes + TypeError: + A `TypeError` is raised if a required argument is not found within the `kwargs` + or if an unexpected argument has been given. + """ # pylint: disable=too-many-instance-attributes def _extract_params(self, kwargs, hyperparameters): """Extract init, fit and produce params from kwargs. @@ -63,16 +65,16 @@ def _extract_params(self, kwargs, hyperparameters): have been given and that nothing unexpected exists in the input. Args: - kwargs (dict): dict containing the Keyword arguments that have - been passed to the `__init__` method upon - initialization. - hyperparameters (dict): hyperparameters dictionary, as found in - the JSON annotation. + kwargs (dict): + dict containing the Keyword arguments that have been passed to the `__init__` + method upon initialization. + hyperparameters (dict): + hyperparameters dictionary, as found in the JSON annotation. Raises: - TypeError: A `TypeError` is raised if a required argument is not - found in the `kwargs` dict, or if an unexpected - argument has been given. + TypeError: + A `TypeError` is raised if a required argument is not found in the `kwargs` dict, + or if an unexpected argument has been given. """ init_params = dict() fit_params = dict() @@ -138,7 +140,6 @@ def _get_tunable(cls, hyperparameters, init_params): return tunable def __init__(self, name, **kwargs): - self.name = name metadata = load_primitive(name) @@ -174,6 +175,7 @@ def __init__(self, name, **kwargs): self.set_hyperparameters(default) def __str__(self): + """Return a string that represents this block.""" return 'MLBlock - {}'.format(self.name) def get_tunable_hyperparameters(self): @@ -210,9 +212,9 @@ def set_hyperparameters(self, hyperparameters): If necessary, a new instance of the primitive is created. Args: - hyperparameters (dict): Dictionary containing as keys the name - of the hyperparameters and as values - the values to be used. + hyperparameters (dict): + Dictionary containing as keys the name of the hyperparameters and as + values the values to be used. """ self._hyperparameters.update(hyperparameters) @@ -233,12 +235,13 @@ def fit(self, **kwargs): the primitive is a simple function, this will be a noop. Args: - **kwargs: Any given keyword argument will be directly passed - to the primitive fit method. + **kwargs: + Any given keyword argument will be directly passed to the primitive fit method. Raises: - TypeError: A `TypeError` might be raised if any argument not - expected by the primitive fit method is given. 
+                method is given.
+        """
         if self.fit_method is not None:
             fit_args = self._fit_params.copy()
diff --git a/mlblocks/mlpipeline.py b/mlblocks/mlpipeline.py
index 91e44341..eddb442e 100644
--- a/mlblocks/mlpipeline.py
+++ b/mlblocks/mlpipeline.py
@@ -34,38 +34,35 @@ class MLPipeline():
     results, which will be returned as the prediction of the pipeline.
 
     Attributes:
-        primitives (list): List of the names of the primitives that compose
-            this pipeline.
-        blocks (list): OrderedDict of the block names and the corresponding
-            MLBlock instances.
-        init_params (dict): init_params dictionary, as given when the instance
-            was created.
-        input_names (dict): input_names dictionary, as given when the instance
-            was created.
-        output_names (dict): output_names dictionary, as given when the instance
-            was created.
+        primitives (list):
+            List of the names of the primitives that compose this pipeline.
+        blocks (list):
+            OrderedDict of the block names and the corresponding MLBlock instances.
+        init_params (dict):
+            init_params dictionary, as given when the instance was created.
+        input_names (dict):
+            input_names dictionary, as given when the instance was created.
+        output_names (dict):
+            output_names dictionary, as given when the instance was created.
 
     Args:
-        primitives (list): List with the names of the primitives that will
-            compose this pipeline.
-        init_params (dict): dictionary containing initialization arguments to
-            be passed when creating the MLBlocks instances.
-            The dictionary keys must be the corresponding
-            primitive names and the values must be another
-            dictionary that will be passed as `**kargs` to the
-            MLBlock instance.
-        input_names (dict): dictionary that maps input variable names with the
-            actual names expected by each primitive. This
-            allows reusing the same input argument for multiple
-            primitives that name it differently, as well as
-            passing different values to primitives that expect
-            arguments named similary.
-        output_names (dict): dictionary that maps output variable names with
-            the name these variables will be given when stored
-            in the context dictionary. This allows storing
-            the output of different primitives in different
-            variables, even if the primitive output name is
-            the same one.
+        primitives (list):
+            List with the names of the primitives that will compose this pipeline.
+        init_params (dict):
+            dictionary containing initialization arguments to be passed when creating the
+            MLBlocks instances. The dictionary keys must be the corresponding primitive names
+            and the values must be another dictionary that will be passed as `**kwargs` to the
+            MLBlock instance.
+        input_names (dict):
+            dictionary that maps input variable names with the actual names expected by each
+            primitive. This allows reusing the same input argument for multiple primitives that
+            name it differently, as well as passing different values to primitives that expect
+            arguments named similarly.
+        output_names (dict):
+            dictionary that maps output variable names with the name these variables will be
+            given when stored in the context dictionary. This allows storing the output of
+            different primitives in different variables, even if the primitive output name is
+            the same one.
     """
 
     def _get_tunable_hyperparameters(self):
@@ -130,9 +130,9 @@ def set_hyperparameters(self, hyperparameters):
         """Set new hyperparameter values for some blocks.
Args:
-            hyperparameters (dict): A dictionary containing the block names as
-                keys and the new hyperparameters dictionary
-                as values.
+            hyperparameters (dict):
+                A dictionary containing the block names as keys and the new hyperparameters
+                dictionary as values.
         """
         for block_name, block_hyperparams in hyperparameters.items():
             self.blocks[block_name].set_hyperparameters(block_hyperparams)
@@ -148,11 +145,12 @@ def _get_block_args(self, block_name, block_args, context):
         was created.
 
         Args:
-            block_name (str): Name of this block. Used to find the corresponding
-                input_names.
-            block_args (list): list of method argument specifications from the
-                primitive.
-            context (dict): current context dictionary.
+            block_name (str):
+                Name of this block. Used to find the corresponding input_names.
+            block_args (list):
+                list of method argument specifications from the primitive.
+            context (dict):
+                current context dictionary.
 
         Returns:
             dict:
@@ -213,22 +211,40 @@ def _get_block_name(self, index):
         return list(self.blocks.keys())[index]
 
     def _get_output_spec(self, output):
-        """Parsre the output specification and get a block name and a variable name.
+        """Parse the output specification and get a block name and a variable name.
 
         The output specification can be of two types: int and str.
 
         If it is an integer, it is interpreted as a block index, and the variable name
         is considered to be ``None``, which means that the whole context will be returned.
 
-        If it is a string, it is interpreted as the block name, and it has to match a block
-        name exactly, including its hash and counter number ``#n``. Optionally, a variable
-        name can be passed at the end using a ``'.'`` as a separator.
-        In this case, the format of the string is `{block_name}.{variable_name}`. Note
-        that the block name can also contain dots, so only the rightmost dot will be
-        considered, and only if the complete string does not match exactly a block name.
+        If it is a string, it can be interpreted in three ways:
+
+            * **block name**: If the string matches a block name exactly, including
+              its hash and counter number ``#n`` at the end, the whole context will be
+              returned after that block is produced.
+            * **variable_name**: If the string does not match any block name and does
+              not contain any dot character, ``'.'``, it will be considered a variable
+              name. In this case, the indicated variable will be extracted from the
+              context and returned after the last block has been produced.
+            * **block_name + variable_name**: If the complete string does not match a
+              block name but it contains at least one dot, ``'.'``, it will be split
+              in two parts on the last dot. If the first part of the string matches a
+              block name exactly, the second part of the string will be considered a
+              variable name, assuming the format ``{block_name}.{variable_name}``, and
+              the indicated variable will be extracted from the context and returned
+              after the block has been produced. Otherwise, if the extracted
+              ``block_name`` does not match a block name exactly, a ``ValueError``
+              will be raised.
 
         Args:
-            output (str or int): Output specification as either a string or an integer.
+            output (str or int):
+                Output specification as either a string or an integer.
+
+        Raises:
+            ValueError:
+                If the output string contains dots but it does not match a block
+                name exactly.
 
         Returns:
             tuple:
                 The output is a tuple containing:
                     * block_name (str): name of the block from which the output will be
                       returned, including its counter number.
                     * variable_name (str): Name of the variable to extract from the context.
                       It can be ``None``, which means that the whole context is to be
                       returned.
""" + # If None is given, both block and varialbe are None if output is None: return None, None + # If an int is given, it is a block index and there is no variable if isinstance(output, int): output = self._get_block_name(output) + return output, None + # If the string matches a block name, there is no variable if output in self.blocks: return output, None + # If there is at least one dot in the output, but it did not match + # a block name, it is considered to be {block_name}.{variable_name} if '.' in output: output_block, output_variable = output.rsplit('.', 1) if output_block not in self.blocks: @@ -255,6 +277,9 @@ def _get_output_spec(self, output): return output_block, output_variable + # If the given string is not a block name and it has no dots, + # it is considered to be a variable name to be extracted + # from the context after the last block has been produced last_block_name = self._get_block_name(-1) return last_block_name, output @@ -285,25 +310,48 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs): `produce` calls will be taken. Args: - X: Fit Data, which the pipeline will learn from. - y: Fit Data labels, which the pipeline will use to learn how to - behave. - output_ (str or int): Output specification, which can be a string or an integer. - If an integer is given, it is interpreted as the block number, and the whole - context after running the specified block will be returned. - If a string is given, it is expected to be the name of one block, including - its counter number at the end. Optionally, a variable name can be included - at the end after the counter number using a ``'.'`` as a separator between the - block name and the variable name. If the variable name is given, this will be - extracted from the context and returned. Otherwise, the whole context will - be returned. - start_ (str or int): Block index or block name to start processing from. The + X: + Fit Data, which the pipeline will learn from. + + y: + Fit Data labels, which the pipeline will use to learn how to + behave. + + output_ (str or int or None): + Output specification, which can be a string or an integer or None. + + * If it is None (default), nothing will be returned + * If an integer is given, it is interpreted as the block number, and the whole + context after running the specified block will be returned. + * If it is a string, it can be interpreted in three ways: + + * **block name**: If the string matches a block name exactly, including + its hash and counter number ``#n`` at the end, the whole context will be + returned after that block is produced. + * **variable_name**: If the string does not match any block name and does + not contain any dot characted, ``'.'``, it will be considered a variable + name. In this case, the indicated variable will be extracted from the + context and returned after the last block has been produced. + * **block_name + variable_name**: If the complete string does not match a + block name but it contains at least one dot, ``'.'``, it will be split + in two parts on the last dot. If the first part of the string matches a + block name exactly, the second part of the string will be considered a + variable name, assuming the format ``{block_name}.{variable_name}``, and + the indicated variable will be extracted from the context and returned + after the block has been produced. Otherwise, if the extracted + ``block_name`` does not match a block name exactly, a ``ValueError`` + will be raised. 
+
+            start_ (str or int or None):
+                Block index or block name to start processing from. The
                 value can either be an integer, which will be interpreted as a block index,
                 or the name of a block, including the counter number at the end.
                 If given, the execution of the pipeline will start on the specified block,
                 and all the blocks before that one will be skipped.
-            **kwargs: Any additional keyword arguments will be directly added
-                to the context dictionary and available for the blocks.
+
+            **kwargs:
+                Any additional keyword arguments will be directly added
+                to the context dictionary and available for the blocks.
 
         Returns:
             None or dict or object:
@@ -328,11 +376,12 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
             start_ = self._get_block_name(start_)
 
         for block_name, block in self.blocks.items():
-            if block_name == start_:
-                start_ = False
-            elif start_:
-                LOGGER.debug("Skipping block %s fit", block_name)
-                continue
+            if start_:
+                if block_name == start_:
+                    start_ = False
+                else:
+                    LOGGER.debug("Skipping block %s fit", block_name)
+                    continue
 
         LOGGER.debug("Fitting block %s", block_name)
         try:
@@ -357,7 +406,11 @@ def fit(self, X=None, y=None, output_=None, start_=None, **kwargs):
             if block_name == output_block:
                 return self._get_output(output_variable, context)
 
-    def predict(self, X=None, output_='y', start_=None, **kwargs):
+        if start_:
+            # We skipped all the blocks up to the end
+            raise ValueError('Unknown block name: {}'.format(start_))
+
+    def predict(self, X=None, output_=None, start_=None, **kwargs):
         """Produce predictions using the blocks of this pipeline.
 
         Sequentially call the `produce` method of each block, capturing the
@@ -369,23 +422,43 @@ def predict(self, X=None, output_=None, start_=None, **kwargs):
         will be taken.
 
         Args:
-            X: Data which the pipeline will use to make predictions.
-            output_ (str or int): Output specification, which can be a string or an integer.
-                If an integer is given, it is interpreted as the block number, and the whole
-                context after running the specified block will be returned.
-                If a string is given, it is expected to be the name of one block, including
-                its counter number at the end. Optionally, a variable name can be included
-                at the end after the counter number using a ``'.'`` as a separator between the
-                block name and the variable name. If the variable name is given, this will be
-                extracted from the context and returned. Otherwise, the whole context will
-                be returned.
+            X:
+                Data which the pipeline will use to make predictions.
+
+            output_ (str or int or None):
+                Output specification, which can be a string or an integer or None.
+                * If it is None (default), the output of the last block will be returned.
+                * If an integer is given, it is interpreted as the block number, and the whole
+                  context after running the specified block will be returned.
+                * If it is a string, it can be interpreted in three ways:
+
+                    * **block name**: If the string matches a block name exactly, including
+                      its hash and counter number ``#n`` at the end, the whole context will be
+                      returned after that block is produced.
+                    * **variable_name**: If the string does not match any block name and does
+                      not contain any dot character, ``'.'``, it will be considered a variable
+                      name. In this case, the indicated variable will be extracted from the
+                      context and returned after the last block has been produced.
+                    * **block_name + variable_name**: If the complete string does not match a
+                      block name but it contains at least one dot, ``'.'``, it will be split
+                      in two parts on the last dot. If the first part of the string matches a
+                      block name exactly, the second part of the string will be considered a
+                      variable name, assuming the format ``{block_name}.{variable_name}``, and
+                      the indicated variable will be extracted from the context and returned
+                      after the block has been produced. Otherwise, if the extracted
+                      ``block_name`` does not match a block name exactly, a ``ValueError``
+                      will be raised.
+
+            start_ (str or int or None):
+                Block index or block name to start processing from. The
                 value can either be an integer, which will be interpreted as a block index,
                 or the name of a block, including the counter number at the end.
                 If given, the execution of the pipeline will start on the specified block,
                 and all the blocks before that one will be skipped.
-            **kwargs: Any additional keyword arguments will be directly added
-                to the context dictionary and available for the blocks.
+
+            **kwargs:
+                Any additional keyword arguments will be directly added
+                to the context dictionary and available for the blocks.
 
         Returns:
             None or dict or object:
@@ -408,11 +481,12 @@ def predict(self, X=None, output_=None, start_=None, **kwargs):
             start_ = self._get_block_name(start_)
 
         for block_name, block in self.blocks.items():
-            if block_name == start_:
-                start_ = False
-            elif start_:
-                LOGGER.debug("Skipping block %s produce", block_name)
-                continue
+            if start_:
+                if block_name == start_:
+                    start_ = False
+                else:
+                    LOGGER.debug("Skipping block %s produce", block_name)
+                    continue
 
             LOGGER.debug("Producing block %s", block_name)
             try:
@@ -432,6 +506,9 @@ def predict(self, X=None, output_=None, start_=None, **kwargs):
             # We skipped all the blocks up to the end
             raise ValueError('Unknown block name: {}'.format(start_))
 
+        if output_ is None:
+            return outputs
+
     def to_dict(self):
         """Return all the details of this MLPipeline in a dict.
@@ -487,7 +564,8 @@ def save(self, path):
         The content of the JSON file is the dict returned by the `to_dict` method.
 
         Args:
-            path (str): Path to the JSON file to write.
+            path (str):
+                Path to the JSON file to write.
         """
         with open(path, 'w') as out_file:
             json.dump(self.to_dict(), out_file, indent=4)
@@ -499,7 +577,8 @@ def from_dict(cls, metadata):
         The dict structure is the same as the one created by the `to_dict` method.
 
         Args:
-            metadata (dict): Dictionary containing the pipeline specification.
+            metadata (dict):
+                Dictionary containing the pipeline specification.
 
         Returns:
             MLPipeline:
@@ -531,7 +610,8 @@ def load(cls, path):
         The JSON file format is the same as the one created by the `to_dict` method.
 
         Args:
-            path (str): Path of the JSON file to load.
+            path (str):
+                Path of the JSON file to load.
 
         Returns:
             MLPipeline:
diff --git a/mlblocks/primitives.py b/mlblocks/primitives.py
index 9bca6a5d..f2300f67 100644
--- a/mlblocks/primitives.py
+++ b/mlblocks/primitives.py
@@ -37,6 +37,7 @@ def add_primitives_path(path):
 
     Raises:
         ValueError: A `ValueError` will be raised if the path is not valid.
+
     """
     if path not in _PRIMITIVES_PATHS:
         if not os.path.isdir(path):
@@ -68,7 +69,6 @@ def get_primitives_paths():
         list:
             The list of folders.
     """
-
     primitives_paths = list()
     entry_points = pkg_resources.iter_entry_points('mlprimitives')
     for entry_point in entry_points:
@@ -99,7 +99,6 @@ def load_primitive(name):
         ValueError:
             A `ValueError` will be raised if the primitive cannot be found.
""" - for base_path in get_primitives_paths(): parts = name.split('.') number_of_parts = len(parts) diff --git a/setup.cfg b/setup.cfg index 62ced521..17244565 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,3 +45,9 @@ collect_ignore = ['setup.py'] [tool:pylint] good-names = X,y + +[doc8] +max-line-length = 99 + +[pydocstyle] +add-ignore = D403,D413,D105,D107 diff --git a/setup.py b/setup.py index f6991ab1..c73eb0a6 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,10 @@ # Advanced testing 'tox>=2.9.1', 'coverage>=4.5.1', + + # Documentation style + 'doc8==0.8.0', + 'pydocstyle==3.0.0' ] From 711201650e50e7ef0c3861347ac89abfa1a5c77d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 9 May 2019 15:42:10 +0200 Subject: [PATCH 11/11] Add missing dependency --- setup.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index c73eb0a6..f355be93 100644 --- a/setup.py +++ b/setup.py @@ -40,9 +40,10 @@ 'm2r>=0.2.0', 'Sphinx>=1.7.1', 'sphinx_rtd_theme>=0.2.4', - 'graphviz==0.9', - 'ipython==6.5.0', - 'matplotlib==2.2.3', + 'graphviz>=0.9', + 'ipython>=6.5.0', + 'matplotlib>=2.2.3', + 'autodocsumm>=0.1.10', # style check 'flake8>=3.5.0', @@ -61,8 +62,8 @@ 'coverage>=4.5.1', # Documentation style - 'doc8==0.8.0', - 'pydocstyle==3.0.0' + 'doc8>=0.8.0', + 'pydocstyle>=3.0.0' ]