Commit
Merge pull request #38 from ing-bank/7-unify-preprocessing
Unify data preparation
Mateusz Garbacz authored Nov 26, 2020
2 parents 018477a + 8d40d69 commit 2723c88
Showing 19 changed files with 662 additions and 279 deletions.
1 change: 1 addition & 0 deletions .github/workflows/publish_to_pypi.yml
@@ -21,6 +21,7 @@ jobs:
- name: Make sure unit tests succeed
run: |
pip install .
pip install -r requirements_test.txt
pytest
- name: Build package & publish to PyPi
env:
19 changes: 16 additions & 3 deletions .github/workflows/unit_tests.yml
@@ -12,8 +12,18 @@ jobs:
name: Run unit tests
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
matrix:
build: [macos, ubuntu, windows]
include:
- build: macos
os: macos-latest
SKIP_LIGHTGBM: True
- build: ubuntu
os: ubuntu-latest
SKIP_LIGHTGBM: False
- build: windows
os: windows-latest
SKIP_LIGHTGBM: False
python-version: [3.6, 3.7, 3.8]
steps:
- uses: actions/checkout@master
@@ -22,12 +32,15 @@
with:
python-version: ${{ matrix.python-version }}
- name: Static code checking with pyflakes
env:
SKIP_LIGHTGBM: ${{ matrix.SKIP_LIGHTGBM }}
run: |
pip install pytest
pip install pytest-cov
pip install pyflakes
pip install mypy
pip install .
pip install -r tests/requirements_test.txt
pytest --cov=probatus/binning --cov=probatus/metric_volatility --cov=probatus/sample_similarity --cov=probatus/stat_tests --cov=probatus/utils --cov=probatus/interpret/ --ignore=tests/interpret/test_inspector.py --cov-report term-missing
pyflakes probatus
mypy probatus --ignore-missing-imports
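
The SKIP_LIGHTGBM flag is exported to the test step; this diff does not show how the test suite consumes it, but a guard along these lines is the usual pattern (the marker below is a hypothetical sketch, not code from this PR):

```python
import os

import pytest

# Hypothetical guard: skip LightGBM-dependent tests when the CI matrix
# exports SKIP_LIGHTGBM=True (e.g. on the macOS build above).
skip_lightgbm = pytest.mark.skipif(
    os.environ.get("SKIP_LIGHTGBM", "False").lower() == "true",
    reason="LightGBM tests are disabled for this CI build",
)

@skip_lightgbm
def test_lightgbm_classifier_fits():
    lightgbm = pytest.importorskip("lightgbm")  # also skips if not installed
    assert lightgbm.LGBMClassifier is not None
```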
87 changes: 18 additions & 69 deletions probatus/feature_elimination/feature_elimination.py
@@ -1,9 +1,8 @@
from probatus.utils import assure_pandas_df, shap_calc, calculate_shap_importance, assure_pandas_series, \
BaseFitComputePlotClass
from probatus.utils import preprocess_data, shap_calc, calculate_shap_importance, BaseFitComputePlotClass, \
preprocess_labels
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, check_cv
from sklearn.base import clone, is_classifier
from sklearn.metrics import check_scoring
@@ -160,60 +159,6 @@ def __init__(self, clf, step=1, min_features_to_select=1, cv=None, scoring=None,
self.verbose = verbose


@staticmethod
def _preprocess_data(X, verbose=0):
"""
Does basic preprocessing of the data: removes static features, warns which features have missing values,
and transforms object-dtype features to category dtype, such that LightGBM handles them by default.
Args:
X (pd.DataFrame):
Provided dataset.
verbose (int, optional):
Controls verbosity of the output:
- 0 - neither prints nor warnings are shown
- 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
- 51 - 100 - shows most important warnings, prints of the feature removal process
- above 100 - presents all prints and all warnings (including SHAP warnings).
Returns:
(pd.DataFrame):
Preprocessed dataset.
"""
# Make sure that X is a pd.DataFrame
X = assure_pandas_df(X)

# Remove static features, those that have only one value for all samples
static_features = [i for i in X.columns if len(X[i].unique()) == 1]
if len(static_features)>0:
if verbose > 0:
warnings.warn(f'Removing static features {static_features}.')
X = X.drop(columns=static_features)

# Warn if missing
columns_with_missing = [column for column in X.columns if X[column].isnull().values.any()]
if len(columns_with_missing) > 0:
if verbose > 0:
warnings.warn(f'The following variables contain missing values {columns_with_missing}. Make sure to '
f'impute missing or apply a model that handles them automatically.')

# Transform Categorical variables into category dtype
indices_obj_dtype_features = [column[0] for column in enumerate(X.dtypes) if column[1] == 'O']
obj_dtype_features = list(X.columns[indices_obj_dtype_features])

# Set categorical features type to category
if len(obj_dtype_features) > 0:
if verbose > 0:
warnings.warn(f'Changing dtype of {obj_dtype_features} from "object" to "category". Treating it as '
f'categorical variable. Make sure that the model handles categorical variables, or encode'
f' them first.')
for obj_dtype_feature in obj_dtype_features:
X[obj_dtype_feature] = X[obj_dtype_feature].astype('category')
return X
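
The behavior of this removed method now lives in the shared probatus.utils.preprocess_data / preprocess_labels helpers used across this PR. A minimal sketch of the unified contract, assuming the helpers keep the removed method's behavior (static-feature removal, missing-value warnings, object-to-category casting); the data is illustrative:

```python
import pandas as pd
from probatus.utils import preprocess_data, preprocess_labels

X = pd.DataFrame({
    'static': [1, 1, 1],       # single unique value: expected to be dropped with a warning
    'cat': ['a', 'b', None],   # object dtype: cast to category, missing value flagged
    'num': [0.1, 0.2, 0.3],
})
y = pd.Series([0, 1, 0])

# Returns the preprocessed frame and the resolved feature names,
# matching the call sites in fit() below.
X_clean, column_names = preprocess_data(X, X_name='X', verbose=50)
y_clean = preprocess_labels(y, y_name='y', index=X_clean.index, verbose=50)
```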


def _get_current_features_to_remove(self, shap_importance_df):
"""
Implements the logic used to determine which features to remove. If step is a positive integer,
@@ -379,17 +324,12 @@ def _get_feature_shap_values_per_fold(X, y, clf, train_index, val_index, scorer,
score_train = scorer(clf, X_train, y_train)
score_val = scorer(clf, X_val, y_val)

if verbose > 100:
suppress_warnings = False
else:
suppress_warnings = True

# Compute SHAP values
shap_values = shap_calc(clf, X_val, suppress_warnings=suppress_warnings)
shap_values = shap_calc(clf, X_val, verbose=verbose)
return shap_values, score_train, score_val


def fit(self, X, y):
def fit(self, X, y, column_names=None):
"""
Fits the object with the provided data. The algorithm starts with the entire dataset, and then sequentially
eliminates features. If [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
@@ -404,18 +344,22 @@ def fit(self, X, y):
y (pd.Series):
Binary labels for X.
column_names (list of str, optional):
List of feature names of the provided samples. If provided, it will be used to overwrite the existing
feature names. If not provided, the existing feature names are used or default feature names are
generated.
Returns:
(ShapRFECV): Fitted object.
"""
# Set seed for results reproducibility
if self.random_state is not None:
np.random.seed(self.random_state)

self.X = self._preprocess_data(X, verbose=self.verbose)
self.y = assure_pandas_series(y, index=self.X.index)
self.X, self.column_names = preprocess_data(X, X_name='X', column_names=column_names, verbose=self.verbose)
self.y = preprocess_labels(y, y_name='y', index=self.X.index, verbose=self.verbose)
self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

remaining_features = current_features_set = self.X.columns.tolist()
remaining_features = current_features_set = self.column_names
round_number = 0

while len(current_features_set) > self.min_features_to_select:
@@ -485,7 +429,7 @@ def compute(self):
return self.report_df


def fit_compute(self, X, y):
def fit_compute(self, X, y, column_names=None):
"""
Fits the object with the provided data. The algorithm starts with the entire dataset, and then sequentially
eliminates features. If [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
@@ -501,12 +445,17 @@ def fit_compute(self, X, y):
y (pd.Series):
Binary labels for X.
column_names (list of str, optional):
List of feature names of the provided samples. If provided, it will be used to overwrite the existing
feature names. If not provided, the existing feature names are used or default feature names are
generated.
Returns:
(pd.DataFrame):
DataFrame containing results of feature elimination from each iteration.
"""

self.fit(X, y)
self.fit(X, y, column_names=column_names)
return self.compute()


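Taken together, a minimal usage sketch of the updated ShapRFECV API with the new column_names argument (assuming LightGBM is installed; data and parameter values are illustrative):

```python
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from probatus.feature_elimination import ShapRFECV

X, y = make_classification(n_samples=500, n_features=10, random_state=42)
clf = LGBMClassifier(n_estimators=50)

# step=2 removes two features per round; cv and scoring are passed to sklearn
shap_elimination = ShapRFECV(clf, step=2, cv=5, scoring='roc_auc')
report = shap_elimination.fit_compute(
    X, y, column_names=[f'feature_{i}' for i in range(10)]
)
print(report)  # one row of results per elimination round
```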
33 changes: 21 additions & 12 deletions probatus/interpret/model_interpret.py
@@ -19,8 +19,8 @@


from probatus.interpret import TreeDependencePlotter
from probatus.utils import assure_column_names_consistency, assure_pandas_df, shap_calc, assure_list_of_strings,\
calculate_shap_importance, BaseFitComputePlotClass
from probatus.utils import preprocess_data, preprocess_labels, shap_calc, calculate_shap_importance, \
BaseFitComputePlotClass, assure_list_of_strings
from sklearn.metrics import roc_auc_score
import numpy as np
import shap
@@ -70,15 +70,24 @@ class ShapModelInterpreter(BaseFitComputePlotClass):
"""


def __init__(self, clf):
def __init__(self, clf, verbose=0):
"""
Initializes the class.
Args:
clf (binary classifier):
Model fitted on X_train.
verbose (int, optional):
Controls verbosity of the output:
- 0 - neither prints nor warnings are shown
- 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
- 51 - 100 - shows most important warnings, prints of the feature removal process
- above 100 - presents all prints and all warnings (including SHAP warnings).
"""
self.clf = clf
self.verbose = verbose


def fit(self, X_train, X_test, y_train, y_test, column_names=None, class_names=None, approximate=False,
@@ -113,19 +122,17 @@ def fit(self, X_train, X_test, y_train, y_test, column_names=None, class_names=N
keyword arguments passed to [shap.TreeExplainer](https://shap.readthedocs.io/en/latest/generated/shap.TreeExplainer.html).
"""

self.X_train = assure_pandas_df(X_train)
self.X_test = assure_pandas_df(X_test)
self.y_train = y_train
self.y_test = y_test
self.X_train, self.column_names = preprocess_data(X_train, X_name='X_train', column_names=column_names,
verbose=self.verbose)
self.X_test, _ = preprocess_data(X_test, X_name='X_test', column_names=column_names, verbose=self.verbose)
self.y_train = preprocess_labels(y_train, y_name='y_train', index=self.X_train.index, verbose=self.verbose)
self.y_test = preprocess_labels(y_test, y_name='y_test', index=self.X_test.index, verbose=self.verbose)

# Set class names
self.class_names = class_names
if self.class_names is None:
self.class_names = ['Negative Class', 'Positive Class']

# Set column names
self.column_names = assure_column_names_consistency(column_names, self.X_train)

# Calculate Metrics
self.auc_train = roc_auc_score(self.y_train, self.clf.predict_proba(self.X_train)[:, 1])
self.auc_test = roc_auc_score(self.y_test, self.clf.predict_proba(self.X_test)[:, 1])
@@ -135,7 +142,8 @@ def fit(self, X_train, X_test, y_train, y_test, column_names=None, class_names=N
)

self.shap_values, self.explainer = shap_calc(self.clf, self.X_test, approximate=approximate,
return_explainer=True, data=self.X_train, **shap_kwargs)
verbose=self.verbose, return_explainer=True,
**shap_kwargs)

# Get expected_value from the explainer
self.expected_value = self.explainer.expected_value
@@ -144,7 +152,8 @@ def fit(self, X_train, X_test, y_train, y_test, column_names=None, class_names=N
self.expected_value = self.expected_value[1]

# Initialize tree dependence plotter
self.tdp = TreeDependencePlotter(self.clf).fit(self.X_test, self.y_test, precalc_shap=self.shap_values)
self.tdp = TreeDependencePlotter(self.clf, verbose=self.verbose).fit(self.X_test, self.y_test,
precalc_shap=self.shap_values)

self.fitted = True

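A short sketch of ShapModelInterpreter with the new verbose argument (model and data are illustrative; auc_train and auc_test are set by fit() as shown in the diff):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from probatus.interpret import ShapModelInterpreter

X, y = make_classification(n_samples=400, n_features=6, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = RandomForestClassifier(max_depth=4, random_state=0).fit(X_train, y_train)

# verbose=0 suppresses all prints and warnings, per the docstring above
interpreter = ShapModelInterpreter(clf, verbose=0)
interpreter.fit(X_train, X_test, y_train, y_test,
                column_names=[f'feature_{i}' for i in range(6)])
print(interpreter.auc_train, interpreter.auc_test)
```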
35 changes: 26 additions & 9 deletions probatus/interpret/shap_dependence.py
@@ -23,7 +23,7 @@
import matplotlib.pyplot as plt

from probatus.binning import SimpleBucketer, AgglomerativeBucketer, QuantileBucketer
from probatus.utils import BaseFitComputePlotClass, assure_pandas_df, shap_to_df
from probatus.utils import BaseFitComputePlotClass, shap_to_df, preprocess_data, preprocess_labels


class TreeDependencePlotter(BaseFitComputePlotClass):
@@ -47,15 +47,32 @@ class TreeDependencePlotter(BaseFitComputePlotClass):
```
"""

def __init__(self, model):
self.model = model
def __init__(self, model, verbose=0):
"""
Initializes the class.
Args:
model (binary classifier):
Model fitted on X_train.
verbose (int, optional):
Controls verbosity of the output:
- 0 - neither prints nor warnings are shown
- 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
- 51 - 100 - shows most important warnings, prints of the feature removal process
- above 100 - presents all prints and all warnings (including SHAP warnings).
"""
self.model = model
self.verbose = verbose
self.target_names = [1, 0]


def __repr__(self):
return "Shap dependence plotter for {}".format(self.model.__class__.__name__)

def fit(self, X, y, precalc_shap=None):

def fit(self, X, y, column_names=None, precalc_shap=None):
"""
Fits the plotter to the model and data by computing the shap values. If the shap_values are passed, they do not
need to be computed.
@@ -70,15 +87,15 @@ def fit(self, X, y, precalc_shap=None):
precalc_shap (Optional, None or np.array):
Precalculated shap values. If provided, they don't need to be computed.
"""
self.X = assure_pandas_df(X)
self.y = y
self.features = self.X.columns
self.X, self.column_names = preprocess_data(X, X_name='X', column_names=column_names, verbose=self.verbose)
self.y = preprocess_labels(y, y_name='y', index=self.X.index, verbose=self.verbose)

self.shap_vals_df = shap_to_df(self.model, self.X, precalc_shap=precalc_shap)
self.shap_vals_df = shap_to_df(self.model, self.X, precalc_shap=precalc_shap, verbose=self.verbose)

self.fitted = True
return self


def compute(self):
"""
Computes the report returned to the user, namely the SHAP values generated on the dataset.
@@ -171,7 +188,7 @@ def _dependence_plot(self, feature, ax=None, figsize=(15, 10)):
matplotlib.pyplot.axes: axes on which plot is drawn.
"""
if type(feature) is int:
feature = self.features[feature]
feature = self.column_names[feature]

X, y, shap_val = self._get_X_y_shap_with_q_cut(feature=feature)

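TreeDependencePlotter is normally driven by ShapModelInterpreter above, but its updated fit() can also be called directly; a hedged sketch with illustrative data:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from probatus.interpret import TreeDependencePlotter

X, y = make_classification(n_samples=300, n_features=5, random_state=1)
clf = RandomForestClassifier(max_depth=3, random_state=1).fit(X, y)

# fit() now accepts column_names and runs the shared preprocessing;
# precalc_shap stays optional, as when called from ShapModelInterpreter.
plotter = TreeDependencePlotter(clf, verbose=0).fit(
    X, y, column_names=[f'feature_{i}' for i in range(5)]
)
print(plotter)  # "Shap dependence plotter for RandomForestClassifier"
```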
2 changes: 1 addition & 1 deletion probatus/metric_volatility/utils.py
@@ -37,7 +37,7 @@ def sample_data(X, y, sampling_type, sampling_fraction, dataset_name='dataset'):
return X,y
else:
rows_indexes = np.random.choice(array_index, number_of_samples, replace=True)
return X[rows_indexes], y[rows_indexes]
return X.iloc[rows_indexes], y.iloc[rows_indexes]


def check_sampling_input(sampling_type, fraction, dataset_name):
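The .iloc fix matters because np.random.choice produces positional indices; with a non-default pandas index, label-based selection would fail or pick the wrong rows. A small illustration (hypothetical data):

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({'a': [10, 20, 30]}, index=[100, 200, 300])
y = pd.Series([0, 1, 0], index=[100, 200, 300])

rows_indexes = np.random.choice(np.arange(len(X)), size=len(X), replace=True)

# X[rows_indexes] would look up columns (KeyError), and y[rows_indexes]
# would look up labels 0..2 that do not exist; .iloc selects by position.
X_sampled, y_sampled = X.iloc[rows_indexes], y.iloc[rows_indexes]
```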