Clearup warnings #1238

Merged on Dec 1, 2021 (52 commits)
Changes from all commits (52 commits)
4aae57c
np.bool deprecation
eddiebergman Sep 7, 2021
1dec490
Invalid escape sequence \_
eddiebergman Sep 8, 2021
1527cee
Series specify dtype
eddiebergman Sep 8, 2021
65ea2db
drop na requires keyword args deprecation
eddiebergman Sep 8, 2021
a19fc82
unspecified np.int size deprecated, use int instead
eddiebergman Sep 8, 2021
89071f0
deprecated unspecified np.int precision
eddiebergman Sep 8, 2021
232a847
Element wise comparison failed, will raise error in the future
eddiebergman Sep 8, 2021
08edc9c
Specify explicit dtype for empty series
eddiebergman Sep 8, 2021
6e37c1c
metric warnings for mismatch between y_pred and y_true label count
eddiebergman Sep 8, 2021
9428ff9
Quantile transformer n_quantiles larger than n_samples warning ignored
eddiebergman Sep 8, 2021
3585cc1
Silenced convergence warnings
eddiebergman Sep 8, 2021
20c8814
pass sklearn args as keywords
eddiebergman Sep 8, 2021
1b49dec
np.bool deprecation
eddiebergman Sep 7, 2021
a2f5758
Invalid escape sequence \_
eddiebergman Sep 8, 2021
6eed461
Series specify dtype
eddiebergman Sep 8, 2021
4974efd
drop na requires keyword args deprecation
eddiebergman Sep 8, 2021
18c1f2d
unspecified np.int size deprecated, use int instead
eddiebergman Sep 8, 2021
a55e63b
deprecated unspecified np.int precision
eddiebergman Sep 8, 2021
5679417
Element wise comparison failed, will raise error in the future
eddiebergman Sep 8, 2021
a987d10
Specify explicit dtype for empty series
eddiebergman Sep 8, 2021
1fa7b89
metric warnings for mismatch between y_pred and y_true label count
eddiebergman Sep 8, 2021
b05e426
Quantile transformer n_quantiles larger than n_samples warning ignored
eddiebergman Sep 8, 2021
6725bff
Silenced convergence warnings
eddiebergman Sep 8, 2021
a23c656
pass sklearn args as keywords
eddiebergman Sep 8, 2021
3ffa283
flake8'd
eddiebergman Sep 8, 2021
7d64320
Merged with dev
eddiebergman Sep 8, 2021
4a87ebb
flake8'd
eddiebergman Sep 8, 2021
272daac
Fixed CategoricalImputation not accounting for sparse matrices
eddiebergman Sep 8, 2021
04927eb
Updated to use distro for linux distribution
eddiebergman Sep 8, 2021
6917675
Ignore convergence warnings for gaussian process regressor
eddiebergman Sep 8, 2021
187d26d
Averaging metrics now use zero_division parameter
eddiebergman Sep 8, 2021
1fdbc5e
Readded scorers to module scope
eddiebergman Sep 8, 2021
54b1549
flake8'd
eddiebergman Sep 8, 2021
62a6673
Fix
eddiebergman Sep 8, 2021
8c234e3
Fixed dtype for metalearner no run
eddiebergman Nov 3, 2021
57d8694
Catch gaussian process iterative fit warning
eddiebergman Nov 3, 2021
0d990c0
Moved ignored warnings to tests
eddiebergman Nov 3, 2021
16b46c8
Correctly type pd.Series
eddiebergman Nov 4, 2021
a337a76
Revert back to usual iterative fit
eddiebergman Nov 4, 2021
0f2e67f
Readded missing iteration increment
eddiebergman Nov 4, 2021
be80a2b
Removed odd backslash
eddiebergman Nov 4, 2021
05675ad
Fixed imputer for sparse matrices
eddiebergman Nov 4, 2021
e365f5d
Ignore warnings we are aware about in tests
eddiebergman Nov 4, 2021
b6c2916
Flake'd
eddiebergman Nov 4, 2021
d031b0d
Revert "Fixed imputer for sparse matrices"
eddiebergman Nov 5, 2021
20b76a5
Revert "Revert "Fixed imputer for sparse matrices""
eddiebergman Nov 5, 2021
efa16e4
Back to default values
eddiebergman Nov 5, 2021
f9909be
Reverted to default behaviour with comment
eddiebergman Nov 5, 2021
0e80ed0
Added xfail test to document
eddiebergman Nov 5, 2021
519bd2a
flaked
eddiebergman Nov 5, 2021
d30a42f
Fixed test, moved to np.testing for assertion
eddiebergman Nov 5, 2021
194d8fa
Update autosklearn/pipeline/components/data_preprocessing/categorical…
eddiebergman Nov 30, 2021
6 changes: 3 additions & 3 deletions autosklearn/automl.py
@@ -1,5 +1,6 @@
# -*- encoding: utf-8 -*-
import copy
import distro
import io
import json
import platform
@@ -689,11 +690,10 @@ def fit(
self._logger.debug('Starting to print environment information')
self._logger.debug(' Python version: %s', sys.version.split('\n'))
try:
self._logger.debug(' Distribution: %s', platform.linux_distribution())
self._logger.debug(f'\tDistribution: {distro.id()}-{distro.version()}-{distro.name()}')
except AttributeError:
# platform.linux_distribution() was removed in Python3.8
# We should move to the distro package as soon as it supports Windows and OSX
pass

self._logger.debug(' System: %s', platform.system())
self._logger.debug(' Machine: %s', platform.machine())
self._logger.debug(' Platform: %s', platform.platform())
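For context, a quick sketch (not part of the diff) of what the new debug line reports, assuming the third-party distro package is installed. platform.linux_distribution() was removed in Python 3.8, so the old call always hit the AttributeError path on modern interpreters:

import distro

print(f"{distro.id()}-{distro.version()}-{distro.name()}")
# e.g. "ubuntu-20.04-Ubuntu" on an Ubuntu 20.04 machine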
4 changes: 2 additions & 2 deletions autosklearn/estimators.py
@@ -228,13 +228,13 @@ def __init__(
Attributes
----------

cv_results\_ : dict of numpy (masked) ndarrays
cv_results_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.

Not all keys returned by scikit-learn are supported yet.

performance_over_time\_ : pandas.core.frame.DataFrame
performance_over_time_ : pandas.core.frame.DataFrame
A ``DataFrame`` containing the model's performance over time data. Can be
used for plotting directly. Please refer to the example
:ref:`Train and Test Inputs <sphx_glr_examples_40_advanced_example_pandas_train_test.py>`.
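A minimal sketch (not part of the diff) of the warning these docstring edits silence: \_ is not a recognised Python escape, so CPython emits a DeprecationWarning (a SyntaxWarning on newer Pythons) when compiling such a string literal. Run with python -W error::DeprecationWarning to surface it as an error:

old = "cv_results\_ : dict"   # DeprecationWarning: invalid escape sequence '\_'
new = "cv_results_ : dict"    # no escape needed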
2 changes: 1 addition & 1 deletion autosklearn/metalearning/metafeatures/metafeatures.py
@@ -184,7 +184,7 @@ def _calculate(self, X, y, logger, categorical):
def _calculate_sparse(self, X, y, logger, categorical):
data = [True if not np.isfinite(x) else False for x in X.data]
missing = X.__class__((data, X.indices, X.indptr), shape=X.shape,
dtype=np.bool)
dtype=bool)
return missing


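np.bool has been a deprecated alias for the builtin bool since NumPy 1.20, so the builtin is a drop-in replacement. A self-contained sketch of the fixed sparse path, with made-up data:

import numpy as np
from scipy import sparse

X = sparse.csr_matrix([[1.0, np.nan], [np.inf, 2.0]])
data = [not np.isfinite(x) for x in X.data]
missing = X.__class__((data, X.indices, X.indptr), shape=X.shape, dtype=bool)
print(missing.toarray())  # marks the nan/inf cells True, no deprecation warning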
6 changes: 4 additions & 2 deletions autosklearn/metalearning/metalearning/meta_base.py
@@ -1,3 +1,5 @@
from collections import OrderedDict

import numpy as np
import pandas as pd

@@ -39,7 +41,7 @@ def __init__(self, configuration_space, aslib_directory, logger):

aslib_reader = aslib_simple.AlgorithmSelectionProblem(self.aslib_directory)
self.metafeatures = aslib_reader.metafeatures
self.algorithm_runs = aslib_reader.algorithm_runs
self.algorithm_runs: OrderedDict[str, pd.DataFrame] = aslib_reader.algorithm_runs
self.configurations = aslib_reader.configurations

configurations = dict()
@@ -65,7 +67,7 @@ def add_dataset(self, name, metafeatures):
self.metafeatures.drop(name.lower(), inplace=True)
self.metafeatures = self.metafeatures.append(metafeatures)

runs = pd.Series([], name=name)
runs = pd.Series([], name=name, dtype=float)
for metric in self.algorithm_runs.keys():
self.algorithm_runs[metric].append(runs)

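The dtype=float argument addresses a pandas deprecation: in the pandas versions current at the time of this PR, constructing an empty Series without an explicit dtype warned that the default would change from float64 to object. A minimal sketch:

import pandas as pd

pd.Series([], name='dataset')               # DeprecationWarning about the default dtype
pd.Series([], name='dataset', dtype=float)  # explicit dtype, no warning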
@@ -111,7 +111,8 @@ def _learn(self, exclude_double_configurations=True):
except KeyError:
# TODO should I really except this?
self.logger.info("Could not find runs for instance %s" % task_id)
runs[task_id] = pd.Series([], name=task_id)
runs[task_id] = pd.Series([], name=task_id, dtype=np.float64)

runs = pd.DataFrame(runs)

kND.fit(all_other_metafeatures, runs)
81 changes: 50 additions & 31 deletions autosklearn/metrics/__init__.py
@@ -1,5 +1,6 @@
from abc import ABCMeta, abstractmethod
from functools import partial
from itertools import product
from typing import Any, Callable, Dict, List, Optional, Union, cast

import numpy as np
@@ -278,16 +279,14 @@ def make_scorer(
optimum=0,
worst_possible_result=MAXINT,
greater_is_better=False)
r2 = make_scorer('r2',
sklearn.metrics.r2_score)

r2 = make_scorer('r2', sklearn.metrics.r2_score)

# Standard Classification Scores
accuracy = make_scorer('accuracy',
sklearn.metrics.accuracy_score)
balanced_accuracy = make_scorer('balanced_accuracy',
sklearn.metrics.balanced_accuracy_score)
f1 = make_scorer('f1',
sklearn.metrics.f1_score)

# Score functions that need decision values
roc_auc = make_scorer('roc_auc',
@@ -297,10 +296,20 @@
average_precision = make_scorer('average_precision',
sklearn.metrics.average_precision_score,
needs_threshold=True)
precision = make_scorer('precision',
sklearn.metrics.precision_score)
recall = make_scorer('recall',
sklearn.metrics.recall_score)

# NOTE: zero_division
#
# Specified as the explicit default, see sklearn docs:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn-metrics-precision-score
precision = make_scorer(
'precision', partial(sklearn.metrics.precision_score, zero_division=0)
)
recall = make_scorer(
'recall', partial(sklearn.metrics.recall_score, zero_division=0)
)
f1 = make_scorer(
'f1', partial(sklearn.metrics.f1_score, zero_division=0)
)

# Score function for probabilistic classification
log_loss = make_scorer('log_loss',
@@ -312,29 +321,39 @@
# TODO what about mathews correlation coefficient etc?


REGRESSION_METRICS = dict()
for scorer in [mean_absolute_error, mean_squared_error, root_mean_squared_error,
mean_squared_log_error, median_absolute_error, r2]:
REGRESSION_METRICS[scorer.name] = scorer

CLASSIFICATION_METRICS = dict()

for scorer in [accuracy, balanced_accuracy, roc_auc, average_precision,
log_loss]:
CLASSIFICATION_METRICS[scorer.name] = scorer

for name, metric in [('precision', sklearn.metrics.precision_score),
('recall', sklearn.metrics.recall_score),
('f1', sklearn.metrics.f1_score)]:
globals()[name] = make_scorer(name, metric)
CLASSIFICATION_METRICS[name] = globals()[name]
for average in ['macro', 'micro', 'samples', 'weighted']:
qualified_name = '{0}_{1}'.format(name, average)
globals()[qualified_name] = make_scorer(qualified_name,
partial(metric,
pos_label=None,
average=average))
CLASSIFICATION_METRICS[qualified_name] = globals()[qualified_name]
REGRESSION_METRICS = {
scorer.name: scorer
for scorer in [
mean_absolute_error, mean_squared_error, root_mean_squared_error,
mean_squared_log_error, median_absolute_error, r2
]
}

CLASSIFICATION_METRICS = {
scorer.name: scorer
for scorer in [
accuracy, balanced_accuracy, roc_auc, average_precision, log_loss
]
}

# NOTE: zero_division
#
# Specified as the explicit default, see sklearn docs:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn-metrics-precision-score
for (base_name, sklearn_metric), average in product(
[
('precision', sklearn.metrics.precision_score),
('recall', sklearn.metrics.recall_score),
('f1', sklearn.metrics.f1_score),
],
['macro', 'micro', 'samples', 'weighted']
):
name = f'{base_name}_{average}'
scorer = make_scorer(
name, partial(sklearn_metric, pos_label=None, average=average, zero_division=0)
)
globals()[name] = scorer # Adds scorer to the module scope
CLASSIFICATION_METRICS[name] = scorer


def calculate_score(
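A sketch (not part of the diff) of both metric changes, assuming scikit-learn is installed. zero_division=0 pins the undefined 0/0 case to 0 without emitting UndefinedMetricWarning, and itertools.product flattens the old nested loops over metrics and averaging modes:

from functools import partial
from itertools import product

import sklearn.metrics

y_true, y_pred = [1, 1, 0], [0, 0, 0]   # no predicted positives, so precision is 0/0

sklearn.metrics.precision_score(y_true, y_pred)   # returns 0.0 with UndefinedMetricWarning
silent_precision = partial(sklearn.metrics.precision_score, zero_division=0)
silent_precision(y_true, y_pred)                  # returns 0.0, no warning

# the product() loop generates the averaged scorer names in a single pass:
names = [f'{base}_{avg}' for base, avg in product(['precision', 'recall', 'f1'],
                                                  ['macro', 'micro', 'samples', 'weighted'])]
print(names[:4])  # ['precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted']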
12 changes: 8 additions & 4 deletions autosklearn/pipeline/components/base.py
@@ -144,13 +144,16 @@ def __str__(self):


class IterativeComponent(AutoSklearnComponent):

def fit(self, X, y, sample_weight=None):
self.iterative_fit(X, y, n_iter=2, refit=True)

iteration = 2
while not self.configuration_fully_fitted():
n_iter = int(2 ** iteration / 2)
self.iterative_fit(X, y, n_iter=n_iter, refit=False)
iteration += 1

return self

@staticmethod
@@ -162,15 +165,16 @@ def get_current_iter(self):


class IterativeComponentWithSampleWeight(AutoSklearnComponent):

def fit(self, X, y, sample_weight=None):
self.iterative_fit(
X, y, n_iter=2, refit=True, sample_weight=sample_weight
)
self.iterative_fit(X, y, n_iter=2, refit=True, sample_weight=sample_weight)

iteration = 2
while not self.configuration_fully_fitted():
n_iter = int(2 ** iteration / 2)
self.iterative_fit(X, y, n_iter=n_iter, sample_weight=sample_weight)
self.iterative_fit(X, y, n_iter=n_iter, refit=False, sample_weight=sample_weight)
iteration += 1

return self

@staticmethod
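Both iterative components request a geometrically growing budget: n_iter = int(2 ** iteration / 2) doubles the increment on each pass until configuration_fully_fitted() reports convergence, and the sample-weight variant now passes refit=False explicitly to match the plain variant. A sketch of the schedule:

for iteration in range(2, 7):
    print(iteration, int(2 ** iteration / 2))
# 2 2
# 3 4
# 4 8
# 5 16
# 6 32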
@@ -27,7 +27,14 @@ def fit(self, X: PIPELINE_DATA_DTYPE,
categories='auto', handle_unknown='use_encoded_value', unknown_value=-1,
)
self.preprocessor.fit(X, y)
return self
return self
else:
# TODO sparse_encoding of negative labels
#
# The next step in the pipeline relies on positive labels
# Given a categorical column [[0], [-1]], the next step will fail
# unless we can fix this encoding
return self

def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
if scipy.sparse.issparse(X):
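A sketch (not part of the diff) of the encoder configured above, assuming scikit-learn >= 0.24, where handle_unknown='use_encoded_value' was introduced. Categories unseen at fit time are mapped to -1 rather than raising, which is exactly the negative label the TODO warns the next pipeline step cannot handle:

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(categories='auto',
                     handle_unknown='use_encoded_value', unknown_value=-1)
enc.fit(np.array([['a'], ['b']]))
print(enc.transform(np.array([['b'], ['c']])))  # 'b' -> 1.0; unseen 'c' -> -1.0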
@@ -3,6 +3,7 @@
from ConfigSpace.configuration_space import ConfigurationSpace

import numpy as np
from scipy.sparse import spmatrix

from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
@@ -28,24 +29,32 @@ def fit(self, X: PIPELINE_DATA_DTYPE,
y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'CategoricalImputation':
import sklearn.impute

fill_value = None
if hasattr(X, 'columns'):
kind = X[X.columns[-1]].dtype.kind
else:
# Series, sparse and numpy have dtype
# Only DataFrame does not
kind = X.dtype.kind
if kind in ("i", "u", "f"):
# We do not want to impute a category with the default
# value (0 is the default) in case such default is in the
# train data already!
fill_value = 0
unique = np.unique(X)
while fill_value in unique:
fill_value -= 1

fill_value: Optional[int] = None

number_kinds = ("i", "u", "f")
if kind in number_kinds:
if isinstance(X, spmatrix):
# TODO negative labels
#
# Previously this was the behaviour and went
# unnoticed. Imputing negative labels results in
# the categorical shift step failing as the ordinal
# encoder can't fix negative labels.
# This is here to document the behaviour explicitly
fill_value = 0
else:
fill_value = min(np.unique(X)) - 1

self.preprocessor = sklearn.impute.SimpleImputer(
strategy='constant', copy=False, fill_value=fill_value)
strategy='constant', copy=False, fill_value=fill_value
)
self.preprocessor.fit(X)
return self

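On the dense path the imputation constant is now chosen one below the smallest value present, so it can never collide with an existing category (the old constant 0 could, and the while loop it replaced only walked downwards one step at a time). The sparse path deliberately keeps 0 to document the long-standing behaviour. A minimal sketch with made-up data:

import numpy as np

X = np.array([[0], [3], [7]])
fill_value = min(np.unique(X)) - 1   # -1 is guaranteed not to be a category in X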
@@ -19,17 +19,27 @@ def __init__(
) -> None:
self.preprocessor: Optional[BaseEstimator] = None

def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
) -> 'AutoSklearnPreprocessingAlgorithm':
def fit(
self,
X: PIPELINE_DATA_DTYPE,
y: Optional[PIPELINE_DATA_DTYPE] = None
) -> 'AutoSklearnPreprocessingAlgorithm':

if self.preprocessor is None:
raise NotFittedError()

self.preprocessor.fit(X)

return self

def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:

if self.preprocessor is None:
raise NotImplementedError()
return self.preprocessor.transform(X)
raise NotFittedError()

transformed_X = self.preprocessor.transform(X)

return transformed_X

@staticmethod
def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
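A small runnable sketch (hypothetical StubPreprocessor, not part of the diff) of the corrected failure mode: calling transform() before fit() now raises sklearn's canonical NotFittedError, where NotImplementedError wrongly suggested the operation was unsupported:

from sklearn.exceptions import NotFittedError

class StubPreprocessor:
    preprocessor = None   # nothing fitted yet

    def transform(self, X):
        if self.preprocessor is None:
            raise NotFittedError()
        return self.preprocessor.transform(X)

try:
    StubPreprocessor().transform([[1.0]])
except NotFittedError:
    print('transform before fit raises NotFittedError')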
@@ -1,3 +1,6 @@
from typing import Optional, Union

from numpy.random import RandomState
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \
UniformIntegerHyperparameter
@@ -8,13 +11,23 @@

class RandomKitchenSinks(AutoSklearnPreprocessingAlgorithm):

def __init__(self, gamma, n_components, random_state=None):
""" Parameters:
def __init__(
self,
gamma: float,
n_components: int,
random_state: Optional[Union[int, RandomState]] = None
) -> None:
"""
Parameters
----------
gamma: float
Parameter of the rbf kernel to be approximated exp(-gamma * x^2)
Parameter of the rbf kernel to be approximated exp(-gamma * x^2)

n_components: int
Number of components (output dimensionality) used to approximate the kernel
Number of components (output dimensionality) used to approximate the kernel

random_state: Optional[int | RandomState]
The random state to pass to the underlying estimator
"""
self.gamma = gamma
self.n_components = n_components
@@ -27,7 +40,10 @@ def fit(self, X, Y=None):
self.gamma = float(self.gamma)

self.preprocessor = sklearn.kernel_approximation.RBFSampler(
self.gamma, self.n_components, self.random_state)
gamma=self.gamma,
n_components=self.n_components,
random_state=self.random_state
)
self.preprocessor.fit(X)
return self

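A sketch (not part of the diff) of the keyword-only construction, assuming scikit-learn is installed; scikit-learn was moving to keyword-only constructor arguments around this time, so passing gamma, n_components and random_state by keyword is the future-proof spelling:

import numpy as np
from sklearn.kernel_approximation import RBFSampler

X = np.random.RandomState(0).rand(10, 4)
rks = RBFSampler(gamma=0.5, n_components=8, random_state=0)
print(rks.fit_transform(X).shape)  # (10, 8): 8 random Fourier features per sample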