Commit
Merge pull request #38 from ing-bank/7-unify-preprocessing
Unify data preparation
Mateusz Garbacz authored Nov 26, 2020
2 parents 018477a + 8d40d69 commit 2723c88
Showing 19 changed files with 662 additions and 279 deletions.
1 change: 1 addition & 0 deletions .github/workflows/publish_to_pypi.yml
@@ -21,6 +21,7 @@ jobs:
- name: Make sure unit tests succeed
run: |
pip install .
pip install -r requirements_test.txt
pytest
- name: Build package & publish to PyPi
env:
19 changes: 16 additions & 3 deletions .github/workflows/unit_tests.yml
@@ -12,8 +12,18 @@ jobs:
name: Run unit tests
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
matrix:
build: [macos, ubuntu, windows]
include:
- build: macos
os: macos-latest
SKIP_LIGHTGBM: True
- build: ubuntu
os: ubuntu-latest
SKIP_LIGHTGBM: False
- build: windows
os: windows-latest
SKIP_LIGHTGBM: False
python-version: [3.6, 3.7, 3.8]
steps:
- uses: actions/checkout@master
@@ -22,12 +32,15 @@
with:
python-version: ${{ matrix.python-version }}
- name: Static code checking with pyflakes
env:
SKIP_LIGHTGBM: ${{ matrix.SKIP_LIGHTGBM }}
run: |
pip install pytest
pip install pytest-cov
pip install pyflakes
pip install mypy
pip install .
pip install -r tests/requirements_test.txt
pytest --cov=probatus/binning --cov=probatus/metric_volatility --cov=probatus/sample_similarity --cov=probatus/stat_tests --cov=probatus/utils --cov=probatus/interpret/ --ignore=tests/interpret/test_inspector.py --cov-report term-missing
pyflakes probatus
mypy probatus --ignore-missing-imports
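
The SKIP_LIGHTGBM flag is exported to the test step; this diff does not show how the test suite consumes it, but a guard along these lines is the usual pattern (the marker below is a hypothetical sketch, not code from this PR):

```python
import os

import pytest

# Hypothetical guard: skip LightGBM-dependent tests when the CI matrix
# exports SKIP_LIGHTGBM=True (e.g. on the macOS build above).
skip_lightgbm = pytest.mark.skipif(
    os.environ.get("SKIP_LIGHTGBM", "False").lower() == "true",
    reason="LightGBM tests are disabled for this CI build",
)

@skip_lightgbm
def test_lightgbm_classifier_fits():
    lightgbm = pytest.importorskip("lightgbm")  # also skips if not installed
    assert lightgbm.LGBMClassifier is not None
```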
87 changes: 18 additions & 69 deletions probatus/feature_elimination/feature_elimination.py
@@ -1,9 +1,8 @@
from probatus.utils import assure_pandas_df, shap_calc, calculate_shap_importance, assure_pandas_series, \
BaseFitComputePlotClass
from probatus.utils import preprocess_data, shap_calc, calculate_shap_importance, BaseFitComputePlotClass, \
preprocess_labels
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, check_cv
from sklearn.base import clone, is_classifier
from sklearn.metrics import check_scoring
@@ -160,60 +159,6 @@ def __init__(self, clf, step=1, min_features_to_select=1, cv=None, scoring=None,
self.verbose = verbose


@staticmethod
def _preprocess_data(X, verbose=0):
"""
Does basic preprocessing of the data: removes static features, warns which features have missing values,
and transforms object-dtype features to category dtype, such that LightGBM handles them by default.
Args:
X (pd.DataFrame):
Provided dataset.
verbose (int, optional):
Controls verbosity of the output:
- 0 - neither prints nor warnings are shown
- 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
- 51 - 100 - shows most important warnings, prints of the feature removal process
- above 100 - presents all prints and all warnings (including SHAP warnings).
Returns:
(pd.DataFrame):
Preprocessed dataset.
"""
# Make sure that X is a pd.DataFrame
X = assure_pandas_df(X)

# Remove static features, those that have only one value for all samples
static_features = [i for i in X.columns if len(X[i].unique()) == 1]
if len(static_features)>0:
if verbose > 0:
warnings.warn(f'Removing static features {static_features}.')
X = X.drop(columns=static_features)

# Warn if missing
columns_with_missing = [column for column in X.columns if X[column].isnull().values.any()]
if len(columns_with_missing) > 0:
if verbose > 0:
warnings.warn(f'The following variables contain missing values {columns_with_missing}. Make sure to '
f'impute missing or apply a model that handles them automatically.')

# Transform Categorical variables into category dtype
indices_obj_dtype_features = [column[0] for column in enumerate(X.dtypes) if column[1] == 'O']
obj_dtype_features = list(X.columns[indices_obj_dtype_features])

# Set categorical features type to category
if len(obj_dtype_features) > 0:
if verbose > 0:
warnings.warn(f'Changing dtype of {obj_dtype_features} from "object" to "category". Treating it as '
f'categorical variable. Make sure that the model handles categorical variables, or encode'
f' them first.')
for obj_dtype_feature in obj_dtype_features:
X[obj_dtype_feature] = X[obj_dtype_feature].astype('category')
return X
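
The behavior of this removed method now lives in the shared probatus.utils.preprocess_data / preprocess_labels helpers used across this PR. A minimal sketch of the unified contract, assuming the helpers keep the removed method's behavior (static-feature removal, missing-value warnings, object-to-category casting); the data is illustrative:

```python
import pandas as pd
from probatus.utils import preprocess_data, preprocess_labels

X = pd.DataFrame({
    'static': [1, 1, 1],       # single unique value: expected to be dropped with a warning
    'cat': ['a', 'b', None],   # object dtype: cast to category, missing value flagged
    'num': [0.1, 0.2, 0.3],
})
y = pd.Series([0, 1, 0])

# Returns the preprocessed frame and the resolved feature names,
# matching the call sites in fit() below.
X_clean, column_names = preprocess_data(X, X_name='X', verbose=50)
y_clean = preprocess_labels(y, y_name='y', index=X_clean.index, verbose=50)
```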


def _get_current_features_to_remove(self, shap_importance_df):
"""
Implements the logic used to determine which features to remove. If step is a positive integer,
@@ -379,17 +324,12 @@ def _get_feature_shap_values_per_fold(X, y, clf, train_index, val_index, scorer,
score_train = scorer(clf, X_train, y_train)
score_val = scorer(clf, X_val, y_val)

if verbose > 100:
suppress_warnings = False
else:
suppress_warnings = True

# Compute SHAP values
shap_values = shap_calc(clf, X_val, suppress_warnings=suppress_warnings)
shap_values = shap_calc(clf, X_val, verbose=verbose)
return shap_values, score_train, score_val


def fit(self, X, y):
def fit(self, X, y, column_names=None):
"""
Fits the object with the provided data. The algorithm starts with the entire dataset, and then sequentially
eliminates features. If [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
@@ -404,18 +344,22 @@ def fit(self, X, y):
y (pd.Series):
Binary labels for X.
column_names (list of str, optional):
List of feature names of the provided samples. If provided, it will be used to overwrite the existing
feature names. If not provided, the existing feature names are used or default feature names are
generated.
Returns:
(ShapRFECV): Fitted object.
"""
# Set seed for results reproducibility
if self.random_state is not None:
np.random.seed(self.random_state)

self.X = self._preprocess_data(X, verbose=self.verbose)
self.y = assure_pandas_series(y, index=self.X.index)
self.X, self.column_names = preprocess_data(X, X_name='X', column_names=column_names, verbose=self.verbose)
self.y = preprocess_labels(y, y_name='y', index=self.X.index, verbose=self.verbose)
self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

remaining_features = current_features_set = self.X.columns.tolist()
remaining_features = current_features_set = self.column_names
round_number = 0

while len(current_features_set) > self.min_features_to_select:
@@ -485,7 +429,7 @@ def compute(self):
return self.report_df


def fit_compute(self, X, y):
def fit_compute(self, X, y, column_names=None):
"""
Fits the object with the provided data. The algorithm starts with the entire dataset, and then sequentially
eliminates features. If [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
@@ -501,12 +445,17 @@ def fit_compute(self, X, y):
y (pd.Series):
Binary labels for X.
column_names (list of str, optional):
List of feature names of the provided samples. If provided, it will be used to overwrite the existing
feature names. If not provided, the existing feature names are used or default feature names are
generated.
Returns:
(pd.DataFrame):
DataFrame containing results of feature elimination from each iteration.
"""

self.fit(X, y)
self.fit(X, y, column_names=column_names)
return self.compute()


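Taken together, a minimal usage sketch of the updated ShapRFECV API with the new column_names argument (assuming LightGBM is installed; data and parameter values are illustrative):

```python
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from probatus.feature_elimination import ShapRFECV

X, y = make_classification(n_samples=500, n_features=10, random_state=42)
clf = LGBMClassifier(n_estimators=50)

# step=2 removes two features per round; cv and scoring are passed to sklearn
shap_elimination = ShapRFECV(clf, step=2, cv=5, scoring='roc_auc')
report = shap_elimination.fit_compute(
    X, y, column_names=[f'feature_{i}' for i in range(10)]
)
print(report)  # one row of results per elimination round
```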
33 changes: 21 additions & 12 deletions probatus/interpret/model_interpret.py
@@ -19,8 +19,8 @@


from probatus.interpret import TreeDependencePlotter
from probatus.utils import assure_column_names_consistency, assure_pandas_df, shap_calc, assure_list_of_strings,\
calculate_shap_importance, BaseFitComputePlotClass
from probatus.utils import preprocess_data, preprocess_labels, shap_calc, calculate_shap_importance, \
BaseFitComputePlotClass, assure_list_of_strings
from sklearn.metrics import roc_auc_score
import numpy as np
import shap
@@ -70,15 +70,24 @@ class ShapModelInterpreter(BaseFitComputePlotClass):
"""


def __init__(self, clf):
def __init__(self, clf, verbose=0):
"""
Initializes the class.
Args:
clf (binary classifier):
Model fitted on X_train.
verbose (int, optional):
Controls verbosity of the output:
- 0 - neither prints nor warnings are shown
- 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
- 51 - 100 - shows most important warnings, prints of the feature removal process
- above 100 - presents all prints and all warnings (including SHAP warnings).
"""
self.clf = clf
self.verbose = verbose


def fit(self, X_train, X_test, y_train, y_test, column_names=None, class_names=None, approximate=False,
@@ -113,19 +122,17 @@ def fit(self, X_train, X_test, y_train, y_test, column_names=None, class_names=N
keyword arguments passed to [shap.TreeExplainer](https://shap.readthedocs.io/en/latest/generated/shap.TreeExplainer.html).
"""

self.X_train = assure_pandas_df(X_train)
self.X_test = assure_pandas_df(X_test)
self.y_train = y_train
self.y_test = y_test
self.X_train, self.column_names = preprocess_data(X_train, X_name='X_train', column_names=column_names,
verbose=self.verbose)
self.X_test, _ = preprocess_data(X_test, X_name='X_test', column_names=column_names, verbose=self.verbose)
self.y_train = preprocess_labels(y_train, y_name='y_train', index=self.X_train.index, verbose=self.verbose)
self.y_test = preprocess_labels(y_test, y_name='y_test', index=self.X_test.index, verbose=self.verbose)

# Set class names
self.class_names = class_names
if self.class_names is None:
self.class_names = ['Negative Class', 'Positive Class']

# Set column names
self.column_names = assure_column_names_consistency(column_names, self.X_train)

# Calculate Metrics
self.auc_train = roc_auc_score(self.y_train, self.clf.predict_proba(self.X_train)[:, 1])
self.auc_test = roc_auc_score(self.y_test, self.clf.predict_proba(self.X_test)[:, 1])
@@ -135,7 +142,8 @@ def fit(self, X_train, X_test, y_train, y_test, column_names=None, class_names=N
)

self.shap_values, self.explainer = shap_calc(self.clf, self.X_test, approximate=approximate,
return_explainer=True, data=self.X_train, **shap_kwargs)
verbose=self.verbose, return_explainer=True,
**shap_kwargs)

# Get expected_value from the explainer
self.expected_value = self.explainer.expected_value
@@ -144,7 +152,8 @@ def fit(self, X_train, X_test, y_train, y_test, column_names=None, class_names=N
self.expected_value = self.expected_value[1]

# Initialize tree dependence plotter
self.tdp = TreeDependencePlotter(self.clf).fit(self.X_test, self.y_test, precalc_shap=self.shap_values)
self.tdp = TreeDependencePlotter(self.clf, verbose=self.verbose).fit(self.X_test, self.y_test,
precalc_shap=self.shap_values)

self.fitted = True

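A short sketch of ShapModelInterpreter with the new verbose argument (model and data are illustrative; auc_train and auc_test are set by fit() as shown in the diff):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from probatus.interpret import ShapModelInterpreter

X, y = make_classification(n_samples=400, n_features=6, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = RandomForestClassifier(max_depth=4, random_state=0).fit(X_train, y_train)

# verbose=0 suppresses all prints and warnings, per the docstring above
interpreter = ShapModelInterpreter(clf, verbose=0)
interpreter.fit(X_train, X_test, y_train, y_test,
                column_names=[f'feature_{i}' for i in range(6)])
print(interpreter.auc_train, interpreter.auc_test)
```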
35 changes: 26 additions & 9 deletions probatus/interpret/shap_dependence.py
@@ -23,7 +23,7 @@
import matplotlib.pyplot as plt

from probatus.binning import SimpleBucketer, AgglomerativeBucketer, QuantileBucketer
from probatus.utils import BaseFitComputePlotClass, assure_pandas_df, shap_to_df
from probatus.utils import BaseFitComputePlotClass, shap_to_df, preprocess_data, preprocess_labels


class TreeDependencePlotter(BaseFitComputePlotClass):
@@ -47,15 +47,32 @@ class TreeDependencePlotter(BaseFitComputePlotClass):
```
"""

def __init__(self, model):
self.model = model
def __init__(self, model, verbose=0):
"""
Initializes the class.
Args:
model (binary classifier):
Model fitted on X_train.
verbose (int, optional):
Controls verbosity of the output:
- 0 - neither prints nor warnings are shown
- 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
- 51 - 100 - shows most important warnings, prints of the feature removal process
- above 100 - presents all prints and all warnings (including SHAP warnings).
"""
self.model = model
self.verbose = verbose
self.target_names = [1, 0]


def __repr__(self):
return "Shap dependence plotter for {}".format(self.model.__class__.__name__)

def fit(self, X, y, precalc_shap=None):

def fit(self, X, y, column_names=None, precalc_shap=None):
"""
Fits the plotter to the model and data by computing the shap values. If the shap_values are passed, they do not
need to be computed.
@@ -70,15 +87,15 @@ def fit(self, X, y, precalc_shap=None):
precalc_shap (Optional, None or np.array):
Precalculated shap values. If provided, they don't need to be computed.
"""
self.X = assure_pandas_df(X)
self.y = y
self.features = self.X.columns
self.X, self.column_names = preprocess_data(X, X_name='X', column_names=column_names, verbose=self.verbose)
self.y = preprocess_labels(y, y_name='y', index=self.X.index, verbose=self.verbose)

self.shap_vals_df = shap_to_df(self.model, self.X, precalc_shap=precalc_shap)
self.shap_vals_df = shap_to_df(self.model, self.X, precalc_shap=precalc_shap, verbose=self.verbose)

self.fitted = True
return self


def compute(self):
"""
Computes the report returned to the user, namely the SHAP values generated on the dataset.
@@ -171,7 +188,7 @@ def _dependence_plot(self, feature, ax=None, figsize=(15, 10)):
matplotlib.pyplot.axes: axes on which plot is drawn.
"""
if type(feature) is int:
feature = self.features[feature]
feature = self.column_names[feature]

X, y, shap_val = self._get_X_y_shap_with_q_cut(feature=feature)

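TreeDependencePlotter is normally driven by ShapModelInterpreter above, but its updated fit() can also be called directly; a hedged sketch with illustrative data:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from probatus.interpret import TreeDependencePlotter

X, y = make_classification(n_samples=300, n_features=5, random_state=1)
clf = RandomForestClassifier(max_depth=3, random_state=1).fit(X, y)

# fit() now accepts column_names and runs the shared preprocessing;
# precalc_shap stays optional, as when called from ShapModelInterpreter.
plotter = TreeDependencePlotter(clf, verbose=0).fit(
    X, y, column_names=[f'feature_{i}' for i in range(5)]
)
print(plotter)  # "Shap dependence plotter for RandomForestClassifier"
```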
2 changes: 1 addition & 1 deletion probatus/metric_volatility/utils.py
@@ -37,7 +37,7 @@ def sample_data(X, y, sampling_type, sampling_fraction, dataset_name='dataset'):
return X,y
else:
rows_indexes = np.random.choice(array_index, number_of_samples, replace=True)
return X[rows_indexes], y[rows_indexes]
return X.iloc[rows_indexes], y.iloc[rows_indexes]


def check_sampling_input(sampling_type, fraction, dataset_name):
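The .iloc fix matters because np.random.choice produces positional indices; with a non-default pandas index, label-based selection would fail or pick the wrong rows. A small illustration (hypothetical data):

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({'a': [10, 20, 30]}, index=[100, 200, 300])
y = pd.Series([0, 1, 0], index=[100, 200, 300])

rows_indexes = np.random.choice(np.arange(len(X)), size=len(X), replace=True)

# X[rows_indexes] would look up columns (KeyError), and y[rows_indexes]
# would look up labels 0..2 that do not exist; .iloc selects by position.
X_sampled, y_sampled = X.iloc[rows_indexes], y.iloc[rows_indexes]
```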