Merge branch 'support_search_cv' of https://github.com/ing-bank/probatus into support_search_cv
ing-bank · Mar 9, 2021 · 473017e · 473017e
2 parents de3806b + 086ef81
commit 473017e
Show file tree

Hide file tree

Showing 37 changed files with 1,449 additions and 663 deletions.
diff --git a/.github/workflows/publish_to_pypi.yml b/.github/workflows/publish_to_pypi.yml
@@ -8,11 +8,12 @@ jobs:
   deploy:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v1
+    - uses: actions/checkout@v2
     - name: Set up Python
-      uses: actions/setup-python@v1
+      uses: actions/setup-python@v2
       with:
         python-version: 3.7
+        fetch-depth: 0
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
@@ -31,5 +32,4 @@ jobs:
         twine upload dist/*
     - name: Deploy mkdocs site
       run: | 
-        pip install mkdocs-material mknotebooks mkdocstrings
         mkdocs gh-deploy --force
diff --git a/.pre-commit-config.yml b/.pre-commit-config.yml
@@ -0,0 +1,34 @@
+repos:
+  - repo: local
+    hooks:
+    - id: black
+      name: black
+      entry: black
+      language: python
+      types: [python]
+      language_version: python3.8
+      args: [--line-length=120]
+  - repo: local
+    hooks:
+    - id: mypy
+      name: mypy
+      entry: mypy
+      language: system
+      types: [python]
+      args: [--ignore-missing-imports, --namespace-packages, --show-error-codes, --pretty]
+  - repo: local
+    hooks:
+    - id: flake8
+      name: flake8
+      entry: flake8
+      language: system
+      types: [python]
+      args: [--max-line-length=120, --docstring-convention=google, "--ignore=D100,D104,D212,D200,E203,W293,D412,W503"]
+# D100 requires all Python files (modules) to have a "public" docstring even if all functions within have a docstring.
+# D104 requires __init__ files to have a docstring
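+# D212 Multi-line docstring summary should start at the first line
+# D200 One-line docstring should fit on one line with quotes
+# D412 No blank lines allowed between a section header and its content
+# E203 whitespace before ':' (for compatibility with black)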
+# W293 blank line contains whitespace
+# W503 line break before binary operator (for compatibility with black)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -32,6 +32,12 @@ Unit testing:
 pytest
 ```
 
+We use [pre-commit](https://pre-commit.com/) hooks to ensure code styling. Install with:
+
+```shell
+pre-commit install
+```
+
 ## Standards
 
 - Python 3.6+

diff --git a/README.md b/README.md
@@ -1,6 +1,5 @@
 <img src="https://github.com/ing-bank/probatus/raw/main/docs/img/logo_large.png" width="120" align="right">
 
-[![pytest](https://github.com/ing-bank/probatus/workflows/Release/badge.svg)](https://github.com/ing-bank/probatus/actions?query=workflow%3A%22Release%22)
 [![pytest](https://github.com/ing-bank/probatus/workflows/Development/badge.svg)](https://github.com/ing-bank/probatus/actions?query=workflow%3A%22Development%22)
 [![codecov](https://codecov.io/gh/ing-bank/probatus/branch/main/graph/badge.svg?token=OFE2YWHLFK)](https://codecov.io/gh/ing-bank/probatus)
 [![PyPi Version](https://img.shields.io/pypi/pyversions/probatus)](#)

diff --git a/probatus/__init__.py b/probatus/__init__.py
@@ -17,4 +17,4 @@
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-name = "probatus"
+name = "probatus"
diff --git a/probatus/binning/__init__.py b/probatus/binning/__init__.py
@@ -18,6 +18,16 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 
-from .binning import SimpleBucketer, AgglomerativeBucketer, QuantileBucketer, TreeBucketer
+from .binning import (
+    SimpleBucketer,
+    AgglomerativeBucketer,
+    QuantileBucketer,
+    TreeBucketer,
+)
 
-__all__ = ['SimpleBucketer', 'AgglomerativeBucketer', 'QuantileBucketer', 'TreeBucketer']
+__all__ = [
+    "SimpleBucketer",
+    "AgglomerativeBucketer",
+    "QuantileBucketer",
+    "TreeBucketer",
+]
diff --git a/probatus/binning/binning.py b/probatus/binning/binning.py
@@ -23,12 +23,17 @@
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils.validation import check_is_fitted
-from probatus.utils import assure_numpy_array, TreePathFinder, ApproximationWarning, BaseFitComputeClass
+from probatus.utils import (
+    assure_numpy_array,
+    TreePathFinder,
+    ApproximationWarning,
+    BaseFitComputeClass,
+)
 import warnings
 from abc import abstractmethod
 
-class Bucketer(BaseFitComputeClass):
 
+class Bucketer(BaseFitComputeClass):
     def __repr__(self):
         repr_ = f"{self.__class__.__name__}\n\tbincount: {self.bin_count}"
         if hasattr(self, "boundaries_"):
@@ -41,15 +46,19 @@ def fit(self):
 
     @property
     def boundaries(self):
-        warnings.warn("The 'boundaries' attribute is deprecated, use 'boundaries_' instead. The underscore suffix signals this is a fitted attribute.",
-                       DeprecationWarning)
+        warnings.warn(
+            "The 'boundaries' attribute is deprecated, use 'boundaries_' instead. The underscore suffix signals this is a fitted attribute.",
+            DeprecationWarning,
+        )
         check_is_fitted(self)
         return self.boundaries_
 
     @property
     def counts(self):
-        warnings.warn("The 'counts' attribute is deprecated, use 'counts_' instead. The underscore suffix signals this is a fitted attribute.", 
-            DeprecationWarning)
+        warnings.warn(
+            "The 'counts' attribute is deprecated, use 'counts_' instead. The underscore suffix signals this is a fitted attribute.",
+            DeprecationWarning,
+        )
         check_is_fitted(self)
         return self.counts_
 
@@ -65,18 +74,22 @@ def compute(self, X, y=None):
 
         """
         check_is_fitted(self)
-        
+
         # np.digitize returns the index of the bin to which each value in the input array belongs.
         # The smallest value of the `boundaries_` attribute equals the lowest value in the set the instance was
         # fitted on; to prevent the smallest value of X from ending up in its own bucket, we ignore the first
         # boundary value.
         digitize_result = np.digitize(X, self.boundaries_[1:], right=True)
-        result = pd.DataFrame({'bucket': digitize_result}).groupby('bucket')['bucket'].count()
+        result = (
+            pd.DataFrame({"bucket": digitize_result})
+            .groupby("bucket")["bucket"]
+            .count()
+        )
         # reindex the dataframe such that also empty buckets are included in the result
         result = result.reindex(np.arange(self.bin_count), fill_value=0)
         return result.values
 
-    def fit_compute(self, X, y = None):
+    def fit_compute(self, X, y=None):
         """
         Apply bucketing to new data and return number of samples per bin
 
@@ -108,6 +121,7 @@ class SimpleBucketer(Bucketer):
     myBucketer.counts gives the number of elements per bucket
     myBucketer.boundaries gives the boundaries of the buckets
     """
+
     def __init__(self, bin_count):
         self.bin_count = bin_count
 
@@ -146,21 +160,28 @@ class AgglomerativeBucketer(Bucketer):
     myBucketer.counts gives the number of elements per bucket
     myBucketer.boundaries gives the boundaries of the buckets
     """
+
     def __init__(self, bin_count):
         self.bin_count = bin_count
 
     @staticmethod
     def agglomerative_clustering_binning(x, bin_count):
-        clustering = AgglomerativeClustering(n_clusters=bin_count).fit(np.asarray(x).reshape(-1, 1))
-        df = pd.DataFrame({'x': x, 'label': clustering.labels_}).sort_values(by='x')
-        cluster_minimum_values = df.groupby('label')['x'].min().sort_values().tolist()
-        cluster_maximum_values = df.groupby('label')['x'].max().sort_values().tolist()
+        clustering = AgglomerativeClustering(n_clusters=bin_count).fit(
+            np.asarray(x).reshape(-1, 1)
+        )
+        df = pd.DataFrame({"x": x, "label": clustering.labels_}).sort_values(by="x")
+        cluster_minimum_values = df.groupby("label")["x"].min().sort_values().tolist()
+        cluster_maximum_values = df.groupby("label")["x"].max().sort_values().tolist()
         # take the mean of the upper boundary of a cluster and the lower boundary of the next cluster
-        boundaries = [np.mean([cluster_minimum_values[i + 1], cluster_maximum_values[i]]) for i in
-                      range(len(cluster_minimum_values) - 1)]
+        boundaries = [
+            np.mean([cluster_minimum_values[i + 1], cluster_maximum_values[i]])
+            for i in range(len(cluster_minimum_values) - 1)
+        ]
         # add the lower boundary of the lowest cluster and the upper boundary of the highest cluster
-        boundaries = [cluster_minimum_values[0]] + boundaries + [cluster_maximum_values[-1]]
-        counts = df.groupby('label')['label'].count().values
+        boundaries = (
+            [cluster_minimum_values[0]] + boundaries + [cluster_maximum_values[-1]]
+        )
+        counts = df.groupby("label")["label"].count().values
         return counts, boundaries
 
     def fit(self, x, y=None):
@@ -173,7 +194,9 @@ def fit(self, x, y=None):
 
         Returns: fitted bucketer object
         """
-        self.counts_, self.boundaries_ = self.agglomerative_clustering_binning(x, self.bin_count)
+        self.counts_, self.boundaries_ = self.agglomerative_clustering_binning(
+            x, self.bin_count
+        )
         return self
 
 
@@ -193,23 +216,26 @@ class QuantileBucketer(Bucketer):
     myBucketer.counts gives the number of elements per bucket
     myBucketer.boundaries gives the boundaries of the buckets
     """
+
     def __init__(self, bin_count):
         self.bin_count = bin_count
 
     @staticmethod
     def quantile_bins(x, bin_count, inf_edges=False):
 
         try:
-            out, boundaries = pd.qcut(x, q=bin_count, retbins=True, duplicates='raise')
+            out, boundaries = pd.qcut(x, q=bin_count, retbins=True, duplicates="raise")
         except ValueError:
             # If there are too many duplicate values (assume a lot of filled missings)
             # this crashes - the exception drops them.
             # This means that it will return approximate quantile bins
-            out, boundaries = pd.qcut(x, q=bin_count, retbins=True, duplicates='drop')
-            warnings.warn(ApproximationWarning("Approximated quantiles - too many unique values" ))
-        df = pd.DataFrame({'x': x})
-        df['label'] = out
-        counts = df.groupby('label').count().values.flatten()
+            out, boundaries = pd.qcut(x, q=bin_count, retbins=True, duplicates="drop")
+            warnings.warn(
+                ApproximationWarning("Approximated quantiles - too many unique values")
+            )
+        df = pd.DataFrame({"x": x})
+        df["label"] = out
+        counts = df.groupby("label").count().values.flatten()
         if inf_edges:
             boundaries[0] = -np.inf
             boundaries[-1] = np.inf
@@ -235,7 +261,7 @@ class TreeBucketer(Bucketer):
 
     Useful if the buckets should be defined such that there is a substantial difference between the buckets in
     the distribution of the target.
-    
+
     Usage:
     ```python
     from probatus.binning import TreeBucketer
@@ -321,27 +347,26 @@ class TreeBucketer(Bucketer):
 
     """
 
-    def __init__(self, inf_edges = False, tree = None, **tree_kwargs):
+    def __init__(self, inf_edges=False, tree=None, **tree_kwargs):
         self.bin_count = -1
         self.inf_edges = inf_edges
         if tree is None:
             self.tree = DecisionTreeClassifier(**tree_kwargs)
         else:
             self.tree = tree
 
-
     @staticmethod
     def tree_bins(x, y, inf_edges, tree):
 
         X_in = assure_numpy_array(x).reshape(-1, 1)
         y_in = assure_numpy_array(y).reshape(-1, 1)
-        tree.fit(X_in,y_in)
+        tree.fit(X_in, y_in)
 
-        if tree.min_samples_leaf>=X_in.shape[0]:
+        if tree.min_samples_leaf >= X_in.shape[0]:
             error_msg = (
-                "Cannot Fit decision tree. min_samples_leaf must be < than the length of x.m" +
-                f"Currently min_samples_leaf {tree.min_samples_leaf} " +
-                f"and the length of X is {X_in.shape[0]}"
+                "Cannot Fit decision tree. min_samples_leaf must be < than the length of x.m"
+                + f"Currently min_samples_leaf {tree.min_samples_leaf} "
+                + f"and the length of X is {X_in.shape[0]}"
             )
             raise ValueError(error_msg)
 
@@ -351,8 +376,8 @@ def tree_bins(x, y, inf_edges, tree):
         bin_count = len(index)
 
         tpf = TreePathFinder(tree)
-        boundaries = [bound['min'] for bound in tpf.get_boundaries().values()]
-        boundaries += [tpf.get_boundaries()[leaves[-1]]['max']]
+        boundaries = [bound["min"] for bound in tpf.get_boundaries().values()]
+        boundaries += [tpf.get_boundaries()[leaves[-1]]["max"]]
 
         if not inf_edges:
             boundaries[0] = np.min(X_in)
@@ -370,5 +395,7 @@ def fit(self, X, y):
 
         Returns: fitted bucketer object
         """
-        self.counts_, self.boundaries_, self.bin_count, self.tree = self.tree_bins(X,y, self.inf_edges, self.tree)
-        return self
+        self.counts_, self.boundaries_, self.bin_count, self.tree = self.tree_bins(
+            X, y, self.inf_edges, self.tree
+        )
+        return self
diff --git a/probatus/feature_elimination/__init__.py b/probatus/feature_elimination/__init__.py
@@ -20,4 +20,4 @@
 
 from .feature_elimination import ShapRFECV
 
-__all__ = ['ShapRFECV']
+__all__ = ["ShapRFECV"]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -20,4 +20,4 @@

		from .feature_elimination import ShapRFECV

		__all__ = ['ShapRFECV']
		__all__ = ["ShapRFECV"]