Skip to content

Commit

Permalink
Merge branch 'support_search_cv' of https://github.com/ing-bank/probatus
Browse files Browse the repository at this point in the history
 into support_search_cv
  • Loading branch information
mgarbacz committed Mar 9, 2021
2 parents de3806b + 086ef81 commit 473017e
Show file tree
Hide file tree
Showing 37 changed files with 1,449 additions and 663 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/publish_to_pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v1
uses: actions/setup-python@v2
with:
python-version: 3.7
fetch-depth: 0
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand All @@ -31,5 +32,4 @@ jobs:
twine upload dist/*
- name: Deploy mkdocs site
run: |
pip install mkdocs-material mknotebooks mkdocstrings
mkdocs gh-deploy --force
34 changes: 34 additions & 0 deletions .pre-commit-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
repos:
- repo: local
hooks:
- id: black
name: black
entry: black
language: python
types: [python]
language_version: python3.8
args: [--line-length=120]
- repo: local
hooks:
- id: mypy
name: mypy
entry: mypy
language: system
types: [python]
args: [--ignore-missing-imports, --namespace-packages, --show-error-codes, --pretty]
- repo: local
hooks:
- id: flake8
name: flake8
entry: flake8
language: system
types: [python]
args: [--max-line-length=120, --docstring-convention=google, "--ignore=D100,D104,D212,D200,E203,W293,D412,W503"]
# D100 requires all Python files (modules) to have a "public" docstring even if all functions within have a docstring.
# D104 requires __init__ files to have a docstring
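# D212 Multi-line docstring summary should start at the first line
# D200 One-line docstring should fit on one line with quotes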
# D412 No blank lines allowed between a section header and its content
# E203 whitespace before ':' (for compatibility with black)
# W293 blank line contains whitespace
# W503 line break before binary operator (for compatibility with black)
6 changes: 6 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ Unit testing:
pytest
```

We use [pre-commit](https://pre-commit.com/) hooks to ensure code styling. Install with:

```shell
pre-commit install
```

## Standards

- Python 3.6+
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
<img src="https://github.com/ing-bank/probatus/raw/main/docs/img/logo_large.png" width="120" align="right">

[![pytest](https://github.com/ing-bank/probatus/workflows/Release/badge.svg)](https://github.com/ing-bank/probatus/actions?query=workflow%3A%22Release%22)
[![pytest](https://github.com/ing-bank/probatus/workflows/Development/badge.svg)](https://github.com/ing-bank/probatus/actions?query=workflow%3A%22Development%22)
[![codecov](https://codecov.io/gh/ing-bank/probatus/branch/main/graph/badge.svg?token=OFE2YWHLFK)](https://codecov.io/gh/ing-bank/probatus)
[![PyPi Version](https://img.shields.io/pypi/pyversions/probatus)](#)
Expand Down
2 changes: 1 addition & 1 deletion probatus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

name = "probatus"
name = "probatus"
14 changes: 12 additions & 2 deletions probatus/binning/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


from .binning import SimpleBucketer, AgglomerativeBucketer, QuantileBucketer, TreeBucketer
from .binning import (
SimpleBucketer,
AgglomerativeBucketer,
QuantileBucketer,
TreeBucketer,
)

__all__ = ['SimpleBucketer', 'AgglomerativeBucketer', 'QuantileBucketer', 'TreeBucketer']
__all__ = [
"SimpleBucketer",
"AgglomerativeBucketer",
"QuantileBucketer",
"TreeBucketer",
]
99 changes: 63 additions & 36 deletions probatus/binning/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,17 @@
from sklearn.cluster import AgglomerativeClustering
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.validation import check_is_fitted
from probatus.utils import assure_numpy_array, TreePathFinder, ApproximationWarning, BaseFitComputeClass
from probatus.utils import (
assure_numpy_array,
TreePathFinder,
ApproximationWarning,
BaseFitComputeClass,
)
import warnings
from abc import abstractmethod

class Bucketer(BaseFitComputeClass):

class Bucketer(BaseFitComputeClass):
def __repr__(self):
repr_ = f"{self.__class__.__name__}\n\tbincount: {self.bin_count}"
if hasattr(self, "boundaries_"):
Expand All @@ -41,15 +46,19 @@ def fit(self):

@property
def boundaries(self):
warnings.warn("The 'boundaries' attribute is deprecated, use 'boundaries_' instead. The underscore suffix signals this is a fitted attribute.",
DeprecationWarning)
warnings.warn(
"The 'boundaries' attribute is deprecated, use 'boundaries_' instead. The underscore suffix signals this is a fitted attribute.",
DeprecationWarning,
)
check_is_fitted(self)
return self.boundaries_

@property
def counts(self):
warnings.warn("The 'counts' attribute is deprecated, use 'counts_' instead. The underscore suffix signals this is a fitted attribute.",
DeprecationWarning)
warnings.warn(
"The 'counts' attribute is deprecated, use 'counts_' instead. The underscore suffix signals this is a fitted attribute.",
DeprecationWarning,
)
check_is_fitted(self)
return self.counts_

Expand All @@ -65,18 +74,22 @@ def compute(self, X, y=None):
"""
check_is_fitted(self)

# np.digitize returns the indices of the bins to which each value in the input array belongs.
# The smallest value of the `boundaries_` attribute equals the lowest value in the set the instance was
# fitted on; to prevent the smallest value of X from ending up in its own bucket, we ignore the first
# boundary value.
digitize_result = np.digitize(X, self.boundaries_[1:], right=True)
result = pd.DataFrame({'bucket': digitize_result}).groupby('bucket')['bucket'].count()
result = (
pd.DataFrame({"bucket": digitize_result})
.groupby("bucket")["bucket"]
.count()
)
# reindex the dataframe such that also empty buckets are included in the result
result = result.reindex(np.arange(self.bin_count), fill_value=0)
return result.values

def fit_compute(self, X, y = None):
def fit_compute(self, X, y=None):
"""
Apply bucketing to new data and return number of samples per bin
Expand Down Expand Up @@ -108,6 +121,7 @@ class SimpleBucketer(Bucketer):
myBucketer.counts gives the number of elements per bucket
myBucketer.boundaries gives the boundaries of the buckets
"""

def __init__(self, bin_count):
self.bin_count = bin_count

Expand Down Expand Up @@ -146,21 +160,28 @@ class AgglomerativeBucketer(Bucketer):
myBucketer.counts gives the number of elements per bucket
myBucketer.boundaries gives the boundaries of the buckets
"""

def __init__(self, bin_count):
self.bin_count = bin_count

@staticmethod
def agglomerative_clustering_binning(x, bin_count):
clustering = AgglomerativeClustering(n_clusters=bin_count).fit(np.asarray(x).reshape(-1, 1))
df = pd.DataFrame({'x': x, 'label': clustering.labels_}).sort_values(by='x')
cluster_minimum_values = df.groupby('label')['x'].min().sort_values().tolist()
cluster_maximum_values = df.groupby('label')['x'].max().sort_values().tolist()
clustering = AgglomerativeClustering(n_clusters=bin_count).fit(
np.asarray(x).reshape(-1, 1)
)
df = pd.DataFrame({"x": x, "label": clustering.labels_}).sort_values(by="x")
cluster_minimum_values = df.groupby("label")["x"].min().sort_values().tolist()
cluster_maximum_values = df.groupby("label")["x"].max().sort_values().tolist()
# take the mean of the upper boundary of a cluster and the lower boundary of the next cluster
boundaries = [np.mean([cluster_minimum_values[i + 1], cluster_maximum_values[i]]) for i in
range(len(cluster_minimum_values) - 1)]
boundaries = [
np.mean([cluster_minimum_values[i + 1], cluster_maximum_values[i]])
for i in range(len(cluster_minimum_values) - 1)
]
# add the lower boundary of the lowest cluster and the upper boundary of the highest cluster
boundaries = [cluster_minimum_values[0]] + boundaries + [cluster_maximum_values[-1]]
counts = df.groupby('label')['label'].count().values
boundaries = (
[cluster_minimum_values[0]] + boundaries + [cluster_maximum_values[-1]]
)
counts = df.groupby("label")["label"].count().values
return counts, boundaries

def fit(self, x, y=None):
Expand All @@ -173,7 +194,9 @@ def fit(self, x, y=None):
Returns: fitted bucketer object
"""
self.counts_, self.boundaries_ = self.agglomerative_clustering_binning(x, self.bin_count)
self.counts_, self.boundaries_ = self.agglomerative_clustering_binning(
x, self.bin_count
)
return self


Expand All @@ -193,23 +216,26 @@ class QuantileBucketer(Bucketer):
myBucketer.counts gives the number of elements per bucket
myBucketer.boundaries gives the boundaries of the buckets
"""

def __init__(self, bin_count):
self.bin_count = bin_count

@staticmethod
def quantile_bins(x, bin_count, inf_edges=False):

try:
out, boundaries = pd.qcut(x, q=bin_count, retbins=True, duplicates='raise')
out, boundaries = pd.qcut(x, q=bin_count, retbins=True, duplicates="raise")
except ValueError:
# If there are too many duplicate values (e.g. many imputed missing values), the bin edges
# are not unique and qcut raises a ValueError - the fallback below drops the duplicate edges.
# This means that it will return approximate quantile bins.
out, boundaries = pd.qcut(x, q=bin_count, retbins=True, duplicates='drop')
warnings.warn(ApproximationWarning("Approximated quantiles - too many unique values" ))
df = pd.DataFrame({'x': x})
df['label'] = out
counts = df.groupby('label').count().values.flatten()
out, boundaries = pd.qcut(x, q=bin_count, retbins=True, duplicates="drop")
warnings.warn(
ApproximationWarning("Approximated quantiles - too many unique values")
)
df = pd.DataFrame({"x": x})
df["label"] = out
counts = df.groupby("label").count().values.flatten()
if inf_edges:
boundaries[0] = -np.inf
boundaries[-1] = np.inf
Expand All @@ -235,7 +261,7 @@ class TreeBucketer(Bucketer):
Useful if the buckets should be defined such that there is a substantial difference between the buckets in
the distribution of the target.
Usage:
```python
from probatus.binning import TreeBucketer
Expand Down Expand Up @@ -321,27 +347,26 @@ class TreeBucketer(Bucketer):
"""

def __init__(self, inf_edges = False, tree = None, **tree_kwargs):
def __init__(self, inf_edges=False, tree=None, **tree_kwargs):
self.bin_count = -1
self.inf_edges = inf_edges
if tree is None:
self.tree = DecisionTreeClassifier(**tree_kwargs)
else:
self.tree = tree


@staticmethod
def tree_bins(x, y, inf_edges, tree):

X_in = assure_numpy_array(x).reshape(-1, 1)
y_in = assure_numpy_array(y).reshape(-1, 1)
tree.fit(X_in,y_in)
tree.fit(X_in, y_in)

if tree.min_samples_leaf>=X_in.shape[0]:
if tree.min_samples_leaf >= X_in.shape[0]:
error_msg = (
"Cannot Fit decision tree. min_samples_leaf must be < than the length of x.m" +
f"Currently min_samples_leaf {tree.min_samples_leaf} " +
f"and the length of X is {X_in.shape[0]}"
"Cannot Fit decision tree. min_samples_leaf must be < than the length of x.m"
+ f"Currently min_samples_leaf {tree.min_samples_leaf} "
+ f"and the length of X is {X_in.shape[0]}"
)
raise ValueError(error_msg)

Expand All @@ -351,8 +376,8 @@ def tree_bins(x, y, inf_edges, tree):
bin_count = len(index)

tpf = TreePathFinder(tree)
boundaries = [bound['min'] for bound in tpf.get_boundaries().values()]
boundaries += [tpf.get_boundaries()[leaves[-1]]['max']]
boundaries = [bound["min"] for bound in tpf.get_boundaries().values()]
boundaries += [tpf.get_boundaries()[leaves[-1]]["max"]]

if not inf_edges:
boundaries[0] = np.min(X_in)
Expand All @@ -370,5 +395,7 @@ def fit(self, X, y):
Returns: fitted bucketer object
"""
self.counts_, self.boundaries_, self.bin_count, self.tree = self.tree_bins(X,y, self.inf_edges, self.tree)
return self
self.counts_, self.boundaries_, self.bin_count, self.tree = self.tree_bins(
X, y, self.inf_edges, self.tree
)
return self
2 changes: 1 addition & 1 deletion probatus/feature_elimination/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@

from .feature_elimination import ShapRFECV

__all__ = ['ShapRFECV']
__all__ = ["ShapRFECV"]
Loading

0 comments on commit 473017e

Please sign in to comment.