Refactor KNN Shapley #610

Merged · 14 commits · Jul 28, 2024
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -4,6 +4,7 @@

### Added

- Refactoring KNN Shapley values with the new sampler architecture [PR #610](https://github.com/aai-institute/pyDVL/pull/610).
- Refactoring MSR Banzhaf semivalues with the new sampler architecture.
[PR #605](https://github.com/aai-institute/pyDVL/pull/605)
- Refactoring group-testing Shapley values with the new sampler architecture
@@ -36,7 +37,7 @@
[PR #597](https://github.com/aai-institute/pyDVL/pull/597)
- Fix a bug in the calculation of variance estimates for MSR Banzhaf
[PR #605](https://github.com/aai-institute/pyDVL/pull/605)

- Fix a bug in KNN Shapley values. See [Issue 613](https://github.com/aai-institute/pyDVL/issues/613) for details.


### Changed
141 changes: 141 additions & 0 deletions src/pydvl/valuation/methods/knn_shapley.py
@@ -13,3 +13,144 @@
the VLDB Endowment, Vol. 12, No. 11, pp. 1610–1623.

"""
from __future__ import annotations

import numpy as np
from joblib import Parallel, delayed
from numpy.typing import NDArray
from sklearn.neighbors import NearestNeighbors
from tqdm.auto import tqdm
from typing_extensions import Self

from pydvl.utils.status import Status
from pydvl.valuation.base import Valuation
from pydvl.valuation.dataset import Dataset
from pydvl.valuation.result import ValuationResult
from pydvl.valuation.utility import KNNClassifierUtility


class KNNShapleyValuation(Valuation):
"""Computes exact Shapley values for a KNN classifier.

This implements the method described in
(Jia, R. et al., 2019)<sup><a href="#jia_efficient_2019a">1</a></sup>.
It exploits the local structure of K-Nearest Neighbours to reduce the number
of calls to the utility function to a constant number per index, thus
reducing computation time to $O(n)$.

Args:
utility: A KNNClassifierUtility with a KNN model to extract parameters from.
The object will not be modified nor used other than to call
[get_params()](https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html#sklearn.base.BaseEstimator.get_params).
progress: Whether to display a progress bar.

"""

def __init__(self, utility: KNNClassifierUtility, progress: bool = True):
super().__init__()
self.utility = utility
self.progress = progress

config = self.utility.model.get_params()
self.n_neighbors = config["n_neighbors"]

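# NearestNeighbors does not accept the classifier-only "weights" parameter,
# so drop it before constructing the helper model.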
del config["weights"]
self.helper_model = NearestNeighbors(**config)

def fit(self, data: Dataset) -> Self:
"""Calculate exact shapley values for a KNN model on a dataset.

This fit method bypasses direct evaluations of the utility function and
calculates the Shapley values directly.

In contrast to other data valuation models, the runtime increases linearly
with the size of the test dataset.

Calculating the KNN valuation is a computationally expensive task that
can be parallelized. To do so, call the `fit()` method inside a
`joblib.parallel_config` context manager as follows:

```python
from joblib import parallel_config

with parallel_config(n_jobs=4):
valuation.fit(data)
```

"""
self.helper_model = self.helper_model.fit(data.x)
n_obs = len(data.x)
n_test = len(self.utility.test_data)

generator = zip(self.utility.test_data.x, self.utility.test_data.y)

generator_with_progress = tqdm(
generator,
total=n_test,
disable=not self.progress,
position=0,
)

with Parallel(return_as="generator") as parallel:
results = parallel(
delayed(self._compute_values_for_one_test_point)(
self.helper_model, x, y, data.y
)
for x, y in generator_with_progress
)
values = np.zeros(n_obs)
for res in results:
values += res
values /= n_test

res = ValuationResult(
algorithm="knn_shapley",
status=Status.Converged,
values=values,
data_names=data.data_names,
)

self.result = res
return self

@staticmethod
def _compute_values_for_one_test_point(
helper_model: NearestNeighbors, x: NDArray, y: int, y_train: NDArray
) -> np.ndarray:
"""Compute the Shapley value for a single test data point.

The Shapley values for the whole test set are the average of the Shapley
values for the individual test points.

Args:
helper_model: A fitted NearestNeighbors model.
x: A single test data point.
y: The correct label of the test data point.
y_train: The training labels.

Returns:
The Shapley values for the test data point.

"""
n_obs = len(y_train)
n_neighbors = helper_model.get_params()["n_neighbors"]

# sorts data indices from close to far
sorted_indices = helper_model.kneighbors(
x.reshape(1, -1), n_neighbors=n_obs, return_distance=False
)[0]

values = np.zeros(n_obs)

idx = sorted_indices[-1]
values[idx] = float(y_train[idx] == y) / n_obs
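# Recursion from Jia et al. (2019): moving from the farthest point to the
# closest, a point's value changes only through the label-match indicator,
# scaled by 1 / max(K, i) for the i-th closest point. This equals the
# paper's min(K, i) / (K * i) factor, since min(K, i) / (K * i) == 1 / max(K, i).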
# reverse range because we want to go from far to close
for i in range(n_obs - 1, 0, -1):
prev_idx = sorted_indices[i]
idx = sorted_indices[i - 1]
values[idx] = values[prev_idx]
values[idx] += (int(y_train[idx] == y) - int(y_train[prev_idx] == y)) / max(
n_neighbors, i
)

return values
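For reference, a minimal usage sketch of the new class, not part of the diff: it assumes `train_data` and `test_data` are already constructed `pydvl.valuation.dataset.Dataset` objects.

```python
from joblib import parallel_config
from sklearn.neighbors import KNeighborsClassifier

from pydvl.valuation.methods.knn_shapley import KNNShapleyValuation
from pydvl.valuation.utility import KNNClassifierUtility

# train_data and test_data are Dataset instances (construction omitted).
model = KNeighborsClassifier(n_neighbors=5)
utility = KNNClassifierUtility(model=model, test_data=test_data)
valuation = KNNShapleyValuation(utility, progress=True)

with parallel_config(n_jobs=4):  # parallelizes over test points
    valuation.fit(train_data)

result = valuation.result  # ValuationResult with one value per training point
```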
6 changes: 3 additions & 3 deletions src/pydvl/valuation/methods/msr_banzhaf.py
@@ -161,9 +161,6 @@ def _combine_results(
estimates, misleading update counts and even wrong values if no further
precaution is taken.

TODO: Verify that the two running means are statistically independent (which is
assumed in the aggregation of variances).

Args:
pos_result: The result of the positive updates.
neg_result: The result of the negative updates.
@@ -172,6 +169,9 @@
Returns:
The combined valuation result.

TODO: Verify that the two running means are statistically independent (which is
assumed in the aggregation of variances).

"""
# define counts as minimum of the two counts (see docstring)
counts = np.minimum(pos_result.counts, neg_result.counts)
6 changes: 3 additions & 3 deletions src/pydvl/valuation/samplers/msr.py
@@ -1,7 +1,7 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Callable
from typing import Callable, List

import numpy as np

@@ -71,15 +71,15 @@ class MSREvaluationStrategy(EvaluationStrategy):

def process(
self, batch: SampleBatch, is_interrupted: NullaryPredicate
) -> list[ValueUpdate]:
) -> List[MSRValueUpdate]:
updates = []
for sample in batch:
updates.extend(self._process_sample(sample))
if is_interrupted():
break
return updates

def _process_sample(self, sample: Sample) -> list[ValueUpdate]:
def _process_sample(self, sample: Sample) -> List[MSRValueUpdate]:
u_value = self.utility(sample)
mask = np.zeros(self.n_indices, dtype=bool)
mask[sample.subset] = True
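# One utility evaluation yields an update for every index: indices in the
# sampled subset receive a positive update and all others a negative one.
# These are the pos_result / neg_result streams that _combine_results in
# msr_banzhaf.py aggregates.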
1 change: 1 addition & 0 deletions src/pydvl/valuation/scorers/__init__.py
@@ -1,4 +1,5 @@
from .base import *
from .classwise import *
from .knn import *
from .supervised import *
from .utils import *
41 changes: 41 additions & 0 deletions src/pydvl/valuation/scorers/knn.py
@@ -0,0 +1,41 @@
"""Specialized scorer for k-nearest neighbors models."""

from typing import cast

import numpy as np
from numpy.typing import NDArray
from sklearn.neighbors import KNeighborsClassifier

from pydvl.valuation.scorers import SupervisedScorer, SupervisedScorerCallable


class KNNClassifierScorer(SupervisedScorer):
"""Scorer for KNN classifier models based on the KNN likelihood.

Typically, users will not create instances of this class directly but indirectly
by using `pydvl.valuation.utility.KNNUtility`.

Args:
test_data: The test data to evaluate the model on.

"""

def __init__(self, test_data):
def scoring(model: KNeighborsClassifier, X: NDArray, y: NDArray) -> float:
probs = model.predict_proba(X)
label_to_pos = {label: i for i, label in enumerate(model.classes_)}
likelihoods = []
for i in range(len(y)):
if y[i] not in label_to_pos:
likelihoods.append(0.0)
else:
likelihoods.append(probs[i, label_to_pos[y[i]]])
return cast(float, np.mean(likelihoods))

super().__init__(
scoring=cast(SupervisedScorerCallable, scoring),
test_data=test_data,
default=0.0,
range=(0, 1),
name="KNN Scorer",
)
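To make the scoring rule concrete, here is a small self-contained check, in plain scikit-learn rather than pyDVL API, of what the scorer computes: the mean predicted probability of each test point's true class.

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X_train = np.array([[0.0], [0.1], [1.0], [1.1]])
y_train = np.array([0, 0, 1, 1])
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

X_test = np.array([[0.05], [1.05]])
y_test = np.array([0, 1])

probs = model.predict_proba(X_test)
label_to_pos = {label: i for i, label in enumerate(model.classes_)}
likelihoods = [probs[i, label_to_pos[y]] for i, y in enumerate(y_test)]
print(np.mean(likelihoods))  # 2 of 3 neighbours match each true label -> 2/3
```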
3 changes: 2 additions & 1 deletion src/pydvl/valuation/utility/__init__.py
@@ -18,5 +18,6 @@
compute the score.
"""

from .modelutility import * # isort: skip
from .knn import *
from .learning import *
from .modelutility import *
67 changes: 67 additions & 0 deletions src/pydvl/valuation/utility/knn.py
@@ -0,0 +1,67 @@
from __future__ import annotations

from sklearn.neighbors import KNeighborsClassifier

from pydvl.utils.caching import CacheBackend, CachedFuncConfig
from pydvl.valuation.dataset import Dataset
from pydvl.valuation.scorers import KNNClassifierScorer
from pydvl.valuation.types import Sample
from pydvl.valuation.utility import ModelUtility

__all__ = ["KNNClassifierUtility"]


class KNNClassifierUtility(ModelUtility[Sample, KNeighborsClassifier]):
"""Utility object for KNN Classifiers.

The utility function is the likelihood of the true class given the model's
prediction.

This works both as a Utility object for general game theoretic valuation methods and
for specialized valuation methods for KNN classifiers.

Args:
model: A KNN classifier model.
test_data: The test data to evaluate the model on.
catch_errors: Set to `True` to catch errors when `fit()` fails. This
could happen in several steps of the pipeline, e.g. when too little
training data is passed, which happens often during Shapley value
calculations. When this happens, the [scorer's default
value][pydvl.valuation.scorers.SupervisedScorer] is returned as a score and
computation continues.
show_warnings: Set to `False` to suppress warnings thrown by `fit()`.
cache_backend: Optional instance of
[CacheBackend][pydvl.utils.caching.base.CacheBackend] used to wrap the
`_utility` method of the Utility instance. By default this is `None`,
meaning that utility evaluations are not cached.
cached_func_options: Optional configuration object for cached utility
evaluation.
clone_before_fit: If `True`, the model will be cloned before calling
`fit()`.

"""

def __init__(
self,
model: KNeighborsClassifier,
test_data: Dataset,
*,
catch_errors: bool = True,
show_warnings: bool = False,
cache_backend: CacheBackend | None = None,
cached_func_options: CachedFuncConfig | None = None,
clone_before_fit: bool = True,
):
scorer = KNNClassifierScorer(test_data)

self.test_data = test_data

super().__init__(
model=model,
scorer=scorer,
catch_errors=catch_errors,
show_warnings=show_warnings,
cache_backend=cache_backend,
cached_func_options=cached_func_options,
clone_before_fit=clone_before_fit,
)
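A construction sketch matching the signature above; `test_data` is assumed to be an existing `Dataset`, and the keyword values shown are the documented defaults.

```python
from sklearn.neighbors import KNeighborsClassifier

from pydvl.valuation.utility import KNNClassifierUtility

# test_data: an existing pydvl.valuation.dataset.Dataset (construction omitted)
utility = KNNClassifierUtility(
    model=KNeighborsClassifier(n_neighbors=5),
    test_data=test_data,
    catch_errors=True,      # fall back to the scorer's default (0.0) if fit() fails
    clone_before_fit=True,  # fit a clone, leaving the user's model untouched
)
```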
1 change: 0 additions & 1 deletion src/pydvl/valuation/utility/modelutility.py
@@ -173,7 +173,6 @@ def _utility(self, sample: SampleT) -> float:
model or the scorer returns [numpy.NaN][]. Otherwise, the score
of the model.
"""

if self.training_data is None:
raise ValueError("No training data provided")
