Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement verbose to ShapRSECV #22

Merged
merged 2 commits into from
Nov 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 50 additions & 50 deletions docs/tutorials/nb_shap_feature_elimination.ipynb

Large diffs are not rendered by default.

75 changes: 55 additions & 20 deletions probatus/feature_elimination/feature_elimination.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ class ShapRFECV:

"""

def __init__(self, clf, step=1, min_features_to_select=1, cv=None, scoring=None, n_jobs=-1, random_state=None):
def __init__(self, clf, step=1, min_features_to_select=1, cv=None, scoring=None, n_jobs=-1, verbose=0,
random_state=None):
"""
This method initializes the class:

Expand Down Expand Up @@ -118,6 +119,14 @@ def __init__(self, clf, step=1, min_features_to_select=1, cv=None, scoring=None,
Number of cores to run in parallel while fitting across folds. None means 1 unless in a
`joblib.parallel_backend` context. -1 means using all processors.

verbose (Optional, int):
Controls verbosity of the output:

- 0 - neither prints nor warnings are shown
- 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
- 51 - 100 - shows most important warnings, prints of the feature removal process
- above 100 - presents all prints and all warnings (including SHAP warnings).

random_state (Optional, int):
Random state set at each round of feature elimination. If it is None, the results will not be
reproducible and in random search at each iteration different hyperparameters might be tested. For
Expand Down Expand Up @@ -148,6 +157,7 @@ def __init__(self, clf, step=1, min_features_to_select=1, cv=None, scoring=None,
self.random_state = random_state
self.n_jobs = n_jobs
self.report_df = pd.DataFrame([])
self.verbose = verbose
self.fitted = False


Expand All @@ -160,7 +170,7 @@ def _check_if_fitted(self):


@staticmethod
def _preprocess_data(X):
def _preprocess_data(X, verbose=0):
Matgrb marked this conversation as resolved.
Show resolved Hide resolved
"""
Does basic preprocessing of the data: Removal of static features, Warns which features have missing variables,
and transform object dtype features to category type, such that LightGBM handles them by default.
Expand All @@ -169,6 +179,14 @@ def _preprocess_data(X):
X (pd.DataFrame):
Provided dataset.

verbose (Optional, int):
Controls verbosity of the output:

- 0 - neither prints nor warnings are shown
- 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
- 51 - 100 - shows most important warnings, prints of the feature removal process
- above 100 - presents all prints and all warnings (including SHAP warnings).

Returns:
(pd.DataFrame):
Preprocessed dataset.
Expand All @@ -179,24 +197,27 @@ def _preprocess_data(X):
# Remove static features, those that have only one value for all samples
static_features = [i for i in X.columns if len(X[i].unique()) == 1]
if len(static_features)>0:
warnings.warn(f'Removing static features {static_features}.')
if verbose > 0:
warnings.warn(f'Removing static features {static_features}.')
X = X.drop(columns=static_features)

# Warn if missing
columns_with_missing = [column for column in X.columns if X[column].isnull().values.any()]
if len(columns_with_missing) > 0:
warnings.warn(f'The following variables contain missing values {columns_with_missing}. Make sure to impute'
f'missing or apply a model that handles them automatically.')
if verbose > 0:
warnings.warn(f'The following variables contain missing values {columns_with_missing}. Make sure to '
f'impute missing or apply a model that handles them automatically.')

# Transform Categorical variables into category dtype
indices_obj_dtype_features = [column[0] for column in enumerate(X.dtypes) if column[1] == 'O']
obj_dtype_features = list(X.columns[indices_obj_dtype_features])

# Set categorical features type to category
if len(obj_dtype_features) > 0:
warnings.warn(f'Changing dtype of {obj_dtype_features} from "object" to "category". Treating it as '
f'categorical variable. Make sure that the model handles categorical variables, or encode '
f'them first.')
if verbose > 0:
warnings.warn(f'Changing dtype of {obj_dtype_features} from "object" to "category". Treating it as '
f'categorical variable. Make sure that the model handles categorical variables, or encode'
f' them first.')
for obj_dtype_feature in obj_dtype_features:
X[obj_dtype_feature] = X[obj_dtype_feature].astype('category')
return X
Expand Down Expand Up @@ -321,7 +342,7 @@ def _report_current_results(self, round_number, current_features_set, features_t


@staticmethod
def _get_feature_shap_values_per_fold(X, y, clf, train_index, val_index, scorer):
def _get_feature_shap_values_per_fold(X, y, clf, train_index, val_index, scorer, verbose=0):
"""
This function calculates the shap values on validation set, and Train and Val score.

Expand All @@ -345,6 +366,14 @@ def _get_feature_shap_values_per_fold(X, y, clf, train_index, val_index, scorer)
A string (see sklearn [model scoring](https://scikit-learn.org/stable/modules/model_evaluation.html)) or
a scorer callable object, function with the signature `scorer(estimator, X, y)`.

verbose (Optional, int):
Controls verbosity of the output:

- 0 - neither prints nor warnings are shown
- 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
- 51 - 100 - shows most important warnings, prints of the feature removal process
- above 100 - presents all prints and all warnings (including SHAP warnings).

Returns:
(np.array, float, float):
Tuple with the results: Shap Values on validation fold, train score, validation score.
Expand All @@ -359,8 +388,13 @@ def _get_feature_shap_values_per_fold(X, y, clf, train_index, val_index, scorer)
score_train = scorer(clf, X_train, y_train)
score_val = scorer(clf, X_val, y_val)

if verbose > 100:
suppress_warnings = False
else:
suppress_warnings = True

# Compute SHAP values
shap_values = shap_calc(clf, X_val, suppress_warnings=True)
shap_values = shap_calc(clf, X_val, suppress_warnings=suppress_warnings)
return shap_values, score_train, score_val


Expand All @@ -383,7 +417,7 @@ def fit(self, X, y):
if self.random_state is not None:
np.random.seed(self.random_state)

self.X = self._preprocess_data(X)
self.X = self._preprocess_data(X, verbose=self.verbose)
self.y = assure_pandas_series(y, index=self.X.index)
self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

Expand All @@ -410,7 +444,8 @@ def fit(self, X, y):

# Perform CV to estimate feature importance with SHAP
results_per_fold = Parallel(n_jobs=self.n_jobs)(delayed(self._get_feature_shap_values_per_fold)(
X=current_X, y=self.y, clf=current_clf, train_index=train_index, val_index=val_index, scorer=self.scorer
X=current_X, y=self.y, clf=current_clf, train_index=train_index, val_index=val_index,
scorer=self.scorer, verbose=self.verbose
) for train_index, val_index in self.cv.split(current_X, self.y))

shap_values = np.vstack([current_result[0] for current_result in results_per_fold])
Expand All @@ -430,14 +465,14 @@ def fit(self, X, y):
train_metric_std = np.round(np.std(scores_train), 3),
val_metric_mean = np.round(np.mean(scores_val), 3),
val_metric_std = np.round(np.std(scores_val), 3))

print(f'Round: {round_number}, Current number of features: {len(current_features_set)}, '
f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
f'{self.report_df.loc[round_number]["val_metric_mean"]} '
f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
f'Num of features left: {len(remaining_features)}. '
f'Removed features at the end of the round: {features_to_remove}')
if self.verbose > 50:
print(f'Round: {round_number}, Current number of features: {len(current_features_set)}, '
f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
f'{self.report_df.loc[round_number]["val_metric_mean"]} '
f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
f'Num of features left: {len(remaining_features)}. '
f'Removed features at the end of the round: {features_to_remove}')
self.fitted = True
Matgrb marked this conversation as resolved.
Show resolved Hide resolved


Expand Down
1 change: 0 additions & 1 deletion probatus/interpret/model_interpret.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,6 @@ def plot(self, plot_type, target_columns=None, samples_index=None, **plot_kwargs
elif plot_type == 'dependence':
ax = []
for feature_name in target_columns:
print()
ax.append(
self.tdp.plot(feature=feature_name, figsize=(10, 7), target_names=self.class_names))
plt.show()
Expand Down
28 changes: 19 additions & 9 deletions tests/feature_elimination/test_feature_elimination.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,18 @@ def y():
return pd.Series([1, 0, 1, 0, 1, 0, 1, 0], index=[1, 2, 3, 4, 5, 6, 7, 8])


def test_shap_rfe_randomized_search(X, y):
def test_shap_rfe_randomized_search(X, y, capsys):

clf = DecisionTreeClassifier(max_depth=1)
param_grid = {
'criterion': ['gini', 'entropy'],
'min_samples_split': [1, 2]
}
search = RandomizedSearchCV(clf, param_grid, cv=2)
with pytest.warns(None) as record:

shap_elimination = ShapRFECV(search, step=0.8, cv=2, scoring='roc_auc', n_jobs=4)

report = shap_elimination.fit_compute(X, y)
shap_elimination = ShapRFECV(search, step=0.8, cv=2, scoring='roc_auc', n_jobs=4, verbose=150)
report = shap_elimination.fit_compute(X, y)

assert shap_elimination.fitted == True
shap_elimination._check_if_fitted
Expand All @@ -39,14 +39,19 @@ def test_shap_rfe_randomized_search(X, y):

ax1 = shap_elimination.plot(show=False)

# Ensure that number of warnings was at least 2 for the verbose (2 generated by probatus + possibly more by SHAP)
assert len(record) >= 2

def test_shap_rfe(X, y):

clf = DecisionTreeClassifier(max_depth=1)
# Check if there is any prints
out, _ = capsys.readouterr()
assert len(out) > 0

shap_elimination = ShapRFECV(clf, random_state=1, step=1, cv=2, scoring='roc_auc', n_jobs=4)
def test_shap_rfe(X, y, capsys):

shap_elimination.fit(X, y)
clf = DecisionTreeClassifier(max_depth=1)
with pytest.warns(None) as record:
shap_elimination = ShapRFECV(clf, random_state=1, step=1, cv=2, scoring='roc_auc', n_jobs=4)
shap_elimination.fit(X, y)

assert shap_elimination.fitted == True
shap_elimination._check_if_fitted
Expand All @@ -58,6 +63,11 @@ def test_shap_rfe(X, y):

ax1 = shap_elimination.plot(show=False)

# Ensure that number of warnings was 0
assert len(record) == 0
# Check if there is any prints
out, _ = capsys.readouterr()
assert len(out) == 0

def test_calculate_number_of_features_to_remove():
assert 3 == ShapRFECV._calculate_number_of_features_to_remove(current_num_of_features=10,
Expand Down