ing-bank · operte · Apr 21, 2022 · Mar 8, 2022 · Mar 8, 2022 · Mar 20, 2022
diff --git a/probatus/feature_elimination/feature_elimination.py b/probatus/feature_elimination/feature_elimination.py
@@ -405,6 +405,7 @@ def fit(
         sample_weight=None,
         columns_to_keep=None,
         column_names=None,
+        groups=None,
         **shap_kwargs,
     ):
         """
@@ -443,6 +444,12 @@ def fit(
                 feature names. If not provided the existing feature names are used or default feature names are
                 generated.
 
+            groups (pd.Series, np.ndarray, list, optional):
+                array-like of shape (n_samples,)
+                Group labels for the samples used while splitting the dataset into train/test set.
+                Only used in conjunction with a "Group" `cv` instance.
+                (e.g. `sklearn.model_selection.GroupKFold`).
+
             **shap_kwargs:
                 keyword arguments passed to
                 [shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
@@ -546,7 +553,7 @@ def fit(
                     sample_weight=sample_weight,
                     **shap_kwargs,
                 )
-                for train_index, val_index in self.cv.split(current_X, self.y)
+                for train_index, val_index in self.cv.split(current_X, self.y, groups)
             )
 
             shap_values = np.vstack([current_result[0] for current_result in results_per_fold])

diff --git a/tests/feature_elimination/test_feature_elimination.py b/tests/feature_elimination/test_feature_elimination.py
@@ -6,7 +6,7 @@
 from probatus.utils import preprocess_labels
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import get_scorer
-from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
+from sklearn.model_selection import RandomizedSearchCV, StratifiedGroupKFold, StratifiedKFold
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
@@ -58,6 +58,14 @@ def sample_weight():
     return pd.Series([1, 1, 1, 1, 1, 1, 1, 1], index=[1, 2, 3, 4, 5, 6, 7, 8])
 
 
+@pytest.fixture(scope="function")
+def groups():
+    """
+    Fixture for group labels: four samples in "grp1" then four in "grp2", indexed 1 to 8.
+    """
+    return pd.Series(["grp1", "grp1", "grp1", "grp1", "grp2", "grp2", "grp2", "grp2"], index=[1, 2, 3, 4, 5, 6, 7, 8])
+
+
 def test_shap_rfe_randomized_search(X, y, capsys):
     """
     Test with RandomizedSearchCV.
@@ -121,6 +129,42 @@ def test_shap_rfe(X, y, sample_weight, capsys):
     assert len(out) == 0
 
 
+def test_shap_rfe_group_cv(X, y, groups, sample_weight, capsys):
+    """
+    Test that ShapRFECV fits with a StratifiedGroupKFold cv when group labels are passed to fit via `groups`.
+    """
+    clf = DecisionTreeClassifier(max_depth=1, random_state=1)
+    cv = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=1)
+    with pytest.warns(None) as record:
+        shap_elimination = ShapRFECV(
+            clf,
+            random_state=1,
+            step=1,
+            cv=cv,
+            scoring="roc_auc",
+            n_jobs=4,
+        )
+        shap_elimination = shap_elimination.fit(
+            X, y, groups=groups, sample_weight=sample_weight, approximate=True, check_additivity=False
+        )
+
+    assert shap_elimination.fitted
+    shap_elimination._check_if_fitted()
+
+    report = shap_elimination.compute()
+
+    assert report.shape[0] == 3
+    assert shap_elimination.get_reduced_features_set(1) == ["col_3"]
+
+    _ = shap_elimination.plot(show=False)
+
+    # Ensure no warnings were recorded. NOTE(review): pytest.warns(None) is deprecated since pytest 7.
+    assert len(record) == 0
+    # Check that nothing was printed (captured stdout is empty)
+    out, _ = capsys.readouterr()
+    assert len(out) == 0
+
+
 def test_shap_pipeline_error(X, y, capsys):
     """
     Test with ShapRFECV for pipelines.