Feature: Lint code #35

Merged · 2 commits · Apr 5, 2023
8 changes: 7 additions & 1 deletion Makefile
@@ -3,8 +3,14 @@ init:
pip3 install poetry
poetry install

lint:
poetry run black .
poetry run isort .
poetry run flake8

test:
poetry run pytest tests --cov=sliceline --cov-report=xml:.github/reports/coverage.xml
poetry run coverage run -m pytest
poetry run coverage report -m

doc:
sphinx-build -a docs/source docs/build
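
The new `lint` target chains the three tools, and Make aborts at the first failing command. As an illustration only (not part of the PR), the same chain expressed in Python:

import subprocess

# run the same commands as `make lint`; check=True mirrors Make's
# stop-on-first-failure behaviour (assumes a poetry environment)
for cmd in (
    ["poetry", "run", "black", "."],
    ["poetry", "run", "isort", "."],
    ["poetry", "run", "flake8"],
):
    subprocess.run(cmd, check=True)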
31 changes: 15 additions & 16 deletions docs/source/conf.py
@@ -1,37 +1,36 @@
# Configuration file for the Sphinx documentation builder.
import sys
import os

import sys

sys.path.insert(0, os.path.abspath("../../"))

# -- Project information

project = 'Sliceline'
copyright = '2022, DataDome'
author = 'Antoine de Daran'
project = "Sliceline"
copyright = "2022, DataDome"
author = "Antoine de Daran"

# -- General configuration

extensions = [
'sphinx.ext.duration',
'sphinx.ext.doctest',
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.intersphinx',
"sphinx.ext.duration",
"sphinx.ext.doctest",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.intersphinx",
]

intersphinx_mapping = {
'python': ('https://docs.python.org/3/', None),
'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
"python": ("https://docs.python.org/3/", None),
"sphinx": ("https://www.sphinx-doc.org/en/master/", None),
}
intersphinx_disabled_domains = ['std']
intersphinx_disabled_domains = ["std"]

templates_path = ['_templates']
templates_path = ["_templates"]

# -- Options for HTML output

html_theme = 'sphinx_rtd_theme'
html_theme = "sphinx_rtd_theme"

# -- Options for EPUB output
epub_show_urls = 'footnote'
epub_show_urls = "footnote"
13 changes: 13 additions & 0 deletions pyproject.toml
@@ -29,3 +29,16 @@ pytest-benchmark = "^3.4.1"
pytest-cov = "^3.0.0"
Sphinx = "^4.0.0"
sphinx-rtd-theme = "^1.0.0"

[tool.black]
line-length = 79
include = '\.pyi?$'

[tool.isort]
profile = "black"

[tool.coverage.run]
omit = [".*", "*/site-packages/*", "tests/*", "*/validation.py"]

[tool.coverage.report]
fail_under = 80
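
The `[tool.coverage.*]` tables back the new `coverage run`/`coverage report` commands in the Makefile, failing the report below 80% line coverage. Two other points worth noting: `line-length = 79` makes black wrap more aggressively than its default of 88 (which is what produced most of the rewrapped lines in slicefinder.py below), and `profile = "black"` configures isort to be black-compatible so the two tools do not fight over import layout. A before/after sketch with hypothetical code, assuming both tools run with exactly this configuration:

# before (hypothetical snippet)
from sliceline.slicefinder import Slicefinder
import numpy as np

result = some_function(first_argument, second_argument, third_argument, fourth_argument)

# after black (line-length 79) and isort (profile "black")
import numpy as np

from sliceline.slicefinder import Slicefinder

result = some_function(
    first_argument, second_argument, third_argument, fourth_argument
)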
6 changes: 4 additions & 2 deletions setup.cfg
@@ -1,4 +1,6 @@
[flake8]
exclude = validation
docstring-convention = numpy
format = pylint
extend-ignore = E203,W503,C901
max-line-length = 100
ignore = E203 W503
max-complexity = 10
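
`E203` (whitespace before punctuation) and `W503` (line break before a binary operator) are the two pycodestyle warnings that conflict with black's output; `extend-ignore` adds them to flake8's default ignore list instead of replacing it, and `C901` (the mccabe complexity warning) is ignored here as well. A hypothetical snippet of black-formatted code that would trip both rules without the ignores:

# E203: black spaces out the ":" in slices built from expressions
chunk = values[offset + 1 : offset + width]

# W503: black breaks long boolean expressions before the operator
is_valid = (
    statistics_minimum_support <= candidate_slice_size
    and candidate_slice_error > average_error_threshold
)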
2 changes: 1 addition & 1 deletion sliceline/__init__.py
@@ -1,3 +1,3 @@
from .slicefinder import Slicefinder

__all__ = ("Slicefinder",)
__all__ = ("Slicefinder",)
103 changes: 73 additions & 30 deletions sliceline/slicefinder.py
Expand Up @@ -24,8 +24,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):

Given an input dataset (`X`) and a model error vector (`errors`), SliceLine finds
the `k` slices in `X` that identify where the model performs significantly worse.
A slice is a subspace of `X` defined by one or more predicates. The maximal dimension
of this subspace is controlled by `max_l`.
A slice is a subspace of `X` defined by one or more predicates.
The maximal dimension of this subspace is controlled by `max_l`.

The slice scoring function is the linear combination of two objectives:
- Find sufficiently large slices, with more than `min_sup` elements
@@ -55,7 +55,8 @@ class Slicefinder(BaseEstimator, TransformerMixin):
min_sup: int or float, default=10
Minimum support threshold.
Inspired by frequent itemset mining, it ensures statistical significance.
If `min_sup` is a float (0 < `min_sup` < 1), it represents the fraction of the input dataset (`X`)
If `min_sup` is a float (0 < `min_sup` < 1),
it represents the fraction of the input dataset (`X`).

verbose: bool, default=True
Controls the verbosity.
@@ -71,7 +72,8 @@

References
----------
`SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging <https://mboehm7.github.io/resources/sigmod2021b_sliceline.pdf>`__,
`SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging
<https://mboehm7.github.io/resources/sigmod2021b_sliceline.pdf>`__,
from *Svetlana Sagadeeva* and *Matthias Boehm* of Graz University of Technology.
"""

@@ -108,9 +110,9 @@ def _check_params(self):
if self.max_l <= 0:
raise ValueError(f"Invalid 'max_l' parameter: {self.max_l}")

if (
self.min_sup < 0 or
(isinstance(self.min_sup, float) and self.min_sup >= 1)
if self.min_sup < 0 or (
isinstance(self.min_sup, float) and self.min_sup >= 1
):
raise ValueError(f"Invalid 'min_sup' parameter: {self.min_sup}")

@@ -228,9 +229,9 @@ def _get_slices_masks(self, X):
slice_candidates = self._top_slices_enc @ X_encoded.T

# self._top_slices_enc.sum(axis=1) is the number of predicate(s) for each top_slices_
slices_masks = (slice_candidates == self._top_slices_enc.sum(axis=1)).A.astype(
int
)
slices_masks = (
slice_candidates == self._top_slices_enc.sum(axis=1)
).A.astype(int)

return slices_masks

@@ -243,8 +244,12 @@ def _n_features_out(self):
def _dummify(array: np.ndarray, n_col_x_encoded: int) -> sp.csr_matrix:
"""Dummify `array` with respect to `n_col_x_encoded`.
Assumption: `array` does not contain any 0."""
assert 0 not in array, "Modality 0 is not expected to be one-hot encoded."
one_hot_encoding = sp.lil_matrix((array.size, n_col_x_encoded), dtype=bool)
assert (
0 not in array
), "Modality 0 is not expected to be one-hot encoded."
one_hot_encoding = sp.lil_matrix(
(array.size, n_col_x_encoded), dtype=bool
)
one_hot_encoding[np.arange(array.size), array - 1] = True
return one_hot_encoding.tocsr()
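
As a toy illustration of the method above: each value v in `array` sets column v - 1 of a sparse one-hot row, which is why modality 0 is asserted away.

import numpy as np
import scipy.sparse as sp

array, n_col_x_encoded = np.array([2, 1, 3]), 3
one_hot = sp.lil_matrix((array.size, n_col_x_encoded), dtype=bool)
one_hot[np.arange(array.size), array - 1] = True
print(one_hot.toarray().astype(int))
# [[0 1 0]
#  [1 0 0]
#  [0 0 1]]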

@@ -257,22 +262,28 @@ def _maintain_top_k(
) -> Tuple[sp.csr_matrix, np.ndarray]:
"""Add new `slices` to `top_k_slices` and update the top-k slices."""
# prune invalid min_sup and scores
valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (statistics[:, 0] > 0)
valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
statistics[:, 0] > 0
)
if np.sum(valid_slices_mask) != 0:
slices, statistics = (
slices[valid_slices_mask],
statistics[valid_slices_mask],
)

if (slices.shape[1] != top_k_slices.shape[1]) & (slices.shape[1] == 1):
if (slices.shape[1] != top_k_slices.shape[1]) & (
slices.shape[1] == 1
):
slices, statistics = slices.T, statistics.T

# merge evaluated candidates with the previous top-k
slices = sp.vstack([top_k_slices, slices])
statistics = np.concatenate([top_k_statistics, statistics])

# extract top-k
top_slices_bool = rankdata(-statistics[:, 0], method="min") <= self.k
top_slices_bool = (
rankdata(-statistics[:, 0], method="min") <= self.k
)
top_k_slices, top_k_statistics = (
slices[top_slices_bool],
statistics[top_slices_bool],
@@ -298,7 +309,9 @@ def _score_ub(
potential_solutions = np.column_stack(
(
self.min_sup * np.ones(slice_sizes_ub.shape[0]),
np.maximum(slice_errors_ub / max_slice_errors_ub, self.min_sup),
np.maximum(
slice_errors_ub / max_slice_errors_ub, self.min_sup
),
slice_sizes_ub,
)
)
@@ -307,7 +320,8 @@
self.alpha
* (
np.minimum(
potential_solutions.T * max_slice_errors_ub, slice_errors_ub
potential_solutions.T * max_slice_errors_ub,
slice_errors_ub,
).T
/ self.average_error_
- potential_solutions
@@ -325,7 +339,9 @@ def _analyse_top_k(top_k_statistics: np.ndarray) -> tuple:
max_slice_scores = min_slice_scores = -np.inf
if top_k_statistics.shape[0] > 0:
max_slice_scores = top_k_statistics[0, 0]
min_slice_scores = top_k_statistics[top_k_statistics.shape[0] - 1, 0]
min_slice_scores = top_k_statistics[
top_k_statistics.shape[0] - 1, 0
]
return max_slice_scores, min_slice_scores

def _score(
@@ -354,7 +370,9 @@ def _eval_slice(
max_slice_errors = slice_candidates.T.multiply(errors).max(axis=1).A

# score of relative error and relative size
slice_scores = self._score(slice_sizes, slice_errors, x_encoded.shape[0])
slice_scores = self._score(
slice_sizes, slice_errors, x_encoded.shape[0]
)
return np.column_stack(
[slice_scores, slice_errors, max_slice_errors, slice_sizes]
)
@@ -379,7 +397,9 @@ def _create_and_score_basic_slices(
slices = self._dummify(attr, n_col_x_encoded)

# score 1-slices and create initial top-k
slice_scores = self._score(slice_sizes, slice_errors, x_encoded.shape[0])
slice_scores = self._score(
slice_sizes, slice_errors, x_encoded.shape[0]
)
statistics = np.column_stack(
(slice_scores, slice_errors, max_slice_errors, slice_sizes)
)
@@ -397,19 +417,25 @@ def _get_pruned_s_r(
) -> Tuple[sp.csr_matrix, np.ndarray]:
"""Prune invalid slices.
Does not affect overall pruning effectiveness due to the handling of missing parents."""
valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (statistics[:, 1] > 0)
valid_slices_mask = (statistics[:, 3] >= self.min_sup) & (
statistics[:, 1] > 0
)
return slices[valid_slices_mask], statistics[valid_slices_mask]

@staticmethod
def _join_compatible_slices(slices: sp.csr_matrix, level: int) -> np.ndarray:
def _join_compatible_slices(
slices: sp.csr_matrix, level: int
) -> np.ndarray:
"""Join compatible slices according to `level`."""
slices_int = slices.astype(int)
join = (slices_int @ slices_int.T).A == level - 2
return np.triu(join, 1) * join
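
A toy run of this join (values invented): parents are compatible at `level` when they share exactly level - 2 predicates, and the upper triangle keeps each unordered pair once.

import numpy as np
import scipy.sparse as sp

# three 2-predicate parent slices over 4 one-hot columns
slices = sp.csr_matrix(
    np.array([[1, 0, 1, 0], [1, 0, 0, 1], [0, 1, 0, 1]], dtype=bool)
)
level = 3
slices_int = slices.astype(int)
join = (slices_int @ slices_int.T).toarray() == level - 2
print((np.triu(join, 1) * join).astype(int))
# [[0 1 0]   -> pairs (0, 1) and (1, 2) can be joined into 3-predicate
#  [0 0 1]      candidates; (0, 2) share no predicate and are skipped
#  [0 0 0]]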

@staticmethod
def _combine_slices(
slices: sp.csr_matrix, statistics: np.ndarray, compatible_slices: np.ndarray
slices: sp.csr_matrix,
statistics: np.ndarray,
compatible_slices: np.ndarray,
) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
"""Combine slices by exploiting parents node statistics."""
parent_1_idx, parent_2_idx = np.where(compatible_slices == 1)
@@ -459,7 +485,9 @@ def _prepare_deduplication_and_pruning(
"""Prepare IDs for deduplication and pruning."""
ids = np.zeros(pair_candidates.shape[0])
dom = feature_domains + 1
for j, (start, end) in enumerate(zip(feature_offset_start, feature_offset_end)):
for j, (start, end) in enumerate(
zip(feature_offset_start, feature_offset_end)
):
sub_pair_candidates = pair_candidates[:, start:end]
# sub_pair_candidates should not contain multiple True values on the same row
i = sub_pair_candidates.argmax(axis=1).T + np.any(
@@ -510,7 +538,10 @@ def _get_pair_candidates(
return sp.csr_matrix(np.empty((0, slices.shape[1])))

ids = self._prepare_deduplication_and_pruning(
feature_offset_start, feature_offset_end, feature_domains, pair_candidates
feature_offset_start,
feature_offset_end,
feature_domains,
pair_candidates,
)

# remove duplicate candidates and select corresponding statistics
@@ -579,7 +610,9 @@ def _search_slices(
np.zeros((0, 4)),
)

max_slice_scores, min_slice_scores = self._analyse_top_k(top_k_statistics)
max_slice_scores, min_slice_scores = self._analyse_top_k(
top_k_statistics
)
logger.debug(
"Initial top-K: count=%i, max=%f, min=%f"
% (top_k_slices.shape[0], max_slice_scores, min_slice_scores)
@@ -589,7 +622,11 @@
# termination condition (max #feature levels)
level = 1
min_condition = min(input_x.shape[1], self.max_l)
while (slices.shape[0] > 0) & (slices.sum() > 0) & (level < min_condition):
while (
(slices.shape[0] > 0)
& (slices.sum() > 0)
& (level < min_condition)
):
level += 1

# enumerate candidate join pairs, including size/error pruning
@@ -620,8 +657,12 @@
slices, statistics, top_k_slices, top_k_statistics
)

max_slice_scores, min_slice_scores = self._analyse_top_k(top_k_statistics)
valid = np.sum((statistics[:, 3] >= self.min_sup) & (statistics[:, 1] > 0))
max_slice_scores, min_slice_scores = self._analyse_top_k(
top_k_statistics
)
valid = np.sum(
(statistics[:, 3] >= self.min_sup) & (statistics[:, 1] > 0)
)
logger.debug(
" -- valid slices after eval: %s/%i" % (valid, slices.shape[0])
)
@@ -634,6 +675,8 @@
if top_k_slices.shape[0] == 0:
self.top_slices_ = np.empty((0, input_x.shape[1]))
else:
self.top_slices_ = self._one_hot_encoder.inverse_transform(top_k_slices)
self.top_slices_ = self._one_hot_encoder.inverse_transform(
top_k_slices
)

logger.debug("Terminated at level %i." % level)