
Add some docs explaining how to "analyze FS algorithm stability" #83

Merged
9 commits merged on Oct 4, 2022
1,476 changes: 1,476 additions & 0 deletions examples/algorithm-stability-yaml/analyze-results.ipynb

Large diffs are not rendered by default.

111 changes: 111 additions & 0 deletions examples/algorithm-stability-yaml/benchmark.py
@@ -0,0 +1,111 @@
from typing import Dict, Optional, Union

import hydra
import numpy as np
import pandas as pd
from skrebate import ReliefF

from fseval.config import PipelineConfig
from fseval.main import run_pipeline
from fseval.types import AbstractEstimator, AbstractMetric, Callback

"""
The checkInputType and getStability functions come from the following paper:

[1] On the Stability of Feature Selection. Sarah Nogueira, Konstantinos Sechidis, Gavin Brown.
Journal of Machine Learning Research (JMLR). 2017.
You can find a full demo using this package at:
http://htmlpreview.github.io/?https://github.com/nogueirs/JMLR2017/blob/master/python/stabilityDemo.html
NB: This package requires the installation of the packages: numpy, scipy and math
"""


def checkInputType(Z):
    """This function checks that Z is of the right type and dimension.
    It raises an exception if not.
    OUTPUT: The input Z as a numpy.ndarray
    """
    ### We check that Z is a list or a numpy.array
    if isinstance(Z, list):
        Z = np.asarray(Z)
    elif not isinstance(Z, np.ndarray):
        raise ValueError("The input matrix Z should be of type list or numpy.ndarray")
    ### We check if Z is a matrix (2 dimensions)
    if Z.ndim != 2:
        raise ValueError("The input matrix Z should be of dimension 2")
    return Z


def getStability(Z):
    """
    Let us assume we have M>1 feature sets and d>0 features in total.
    This function computes the stability estimate as given in Definition 4 in [1].

    INPUT: A BINARY matrix Z (given as a list or as a numpy.ndarray of size M*d).
    Each row of the binary matrix represents a feature set, where a 1 at the f^th position
    means the f^th feature has been selected and a 0 means it has not been selected.

    OUTPUT: The stability of the feature selection procedure
    """
    Z = checkInputType(Z)
    M, d = Z.shape
    hatPF = np.mean(Z, axis=0)
    kbar = np.sum(hatPF)
    denom = (kbar / d) * (1 - kbar / d)
    return 1 - (M / (M - 1)) * np.mean(np.multiply(hatPF, 1 - hatPF)) / denom
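
# Illustration (added for these docs; not part of the benchmark file itself):
# getStability on a toy M x d selection matrix. Identical feature sets across
# all runs give a stability of 1.0; disagreement between runs lowers the value.
#
#   getStability([[1, 1, 0, 0],
#                 [1, 1, 0, 0],
#                 [1, 1, 0, 0]])   # identical selections -> 1.0
#
#   getStability([[1, 1, 0, 0],
#                 [1, 1, 0, 0],
#                 [1, 0, 1, 0]])   # one deviating run    -> ~0.33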


class StabilityNogueira(AbstractMetric):
    def score_bootstrap(
        self,
        ranker: AbstractEstimator,
        validator: AbstractEstimator,
        callbacks: Callback,
        scores: Dict,
        **kwargs,
    ) -> Dict:
        # compute stability and send to table
        Z = np.array(self.support_matrix)
        Z = Z.astype(int)
        stability = getStability(Z)
        stability_df = pd.DataFrame([{"stability": stability}])
        callbacks.on_table(stability_df, "stability")

        # set in scores dict
        scores["stability"] = stability

        return scores

    def score_ranking(
        self,
        scores: Union[Dict, pd.DataFrame],
        ranker: AbstractEstimator,
        bootstrap_state: int,
        callbacks: Callback,
        feature_importances: Optional[np.ndarray] = None,
    ):
        # accumulate this bootstrap's feature support vector in the support matrix
        support_matrix = getattr(self, "support_matrix", [])
        self.support_matrix = support_matrix
        self.support_matrix.append(ranker.feature_support_)


class ReliefF_FeatureSelection(ReliefF):
    def fit(self, X, y):
        super(ReliefF_FeatureSelection, self).fit(X, y)

        # extract feature subset from ReliefF
        feature_subset = self.top_features_[: self.n_features_to_select]

        # set `support_` vector
        _, p = np.shape(X)
        self.support_ = np.zeros(p, dtype=bool)
        self.support_[feature_subset] = True


@hydra.main(config_path="conf", config_name="my_config")
def main(cfg: PipelineConfig) -> None:
    run_pipeline(cfg)


if __name__ == "__main__":
    main()
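
Conceptually, the experiment this file wires up is: fit a feature selector on a number of bootstrap resamples, collect the resulting binary support vectors into an M x d matrix, and pass that matrix to `getStability`. A minimal standalone sketch of that idea follows; it assumes only scikit-learn plus the `getStability` function above, and uses an illustrative selector (top-10 random-forest importances) rather than the Boruta/ReliefF rankers configured in this example.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import resample

# A synthetic dataset: 50 features, of which only the first 4 are informative.
X, y = make_classification(n_samples=1000, n_features=50, n_informative=4,
                           n_redundant=0, shuffle=False, random_state=0)

Z = []  # becomes the M x d binary selection matrix
for seed in range(10):  # M = 10 bootstraps, mirroring `n_bootstraps: 10` below
    X_b, y_b = resample(X, y, random_state=seed)
    selector = SelectFromModel(
        RandomForestClassifier(random_state=seed),
        max_features=10, threshold=-np.inf,  # keep the 10 highest-importance features
    ).fit(X_b, y_b)
    Z.append(selector.get_support().astype(int))

print(getStability(Z))  # 1.0 means the same subset was selected in every bootstrap
```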
18 changes: 18 additions & 0 deletions examples/algorithm-stability-yaml/conf/dataset/synclf_hard.yaml
@@ -0,0 +1,18 @@
name: Synclf hard
task: classification
domain: synthetic
group: Synclf
adapter:
  _target_: sklearn.datasets.make_classification
  class_sep: 0.8
  n_classes: 3
  n_clusters_per_class: 3
  n_features: 50
  n_informative: 4
  n_redundant: 0
  n_repeated: 0
  n_samples: 1000
  random_state: 0
  shuffle: false
feature_importances:
  X[:, 0:4]: 1.0
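
The `adapter` block is instantiated through Hydra's `_target_` mechanism, so it corresponds roughly to the scikit-learn call sketched below. Because `shuffle: false`, the informative features occupy the first four columns, which is what the ground-truth `feature_importances` entry `X[:, 0:4]: 1.0` encodes.

```python
from sklearn.datasets import make_classification

# Roughly what the `adapter` above resolves to: 1000 samples and 50 features,
# of which only the first 4 are informative (shuffle=False keeps them in front).
X, y = make_classification(
    n_samples=1000, n_features=50, n_informative=4, n_redundant=0, n_repeated=0,
    n_classes=3, n_clusters_per_class=3, class_sep=0.8, random_state=0, shuffle=False,
)
```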
@@ -0,0 +1,3 @@
# @package metrics
ranking_scores:
  _target_: benchmark.StabilityNogueira
11 changes: 11 additions & 0 deletions examples/algorithm-stability-yaml/conf/my_config.yaml
@@ -0,0 +1,11 @@
defaults:
  - base_pipeline_config
  - _self_
  - override dataset: synclf_hard
  - override validator: knn
  - override /callbacks:
      - to_sql
  - override /metrics:
      - stability_nogueira

n_bootstraps: 10
11 changes: 11 additions & 0 deletions examples/algorithm-stability-yaml/conf/ranker/boruta.yaml
@@ -0,0 +1,11 @@
name: Boruta
estimator:
  _target_: boruta.boruta_py.BorutaPy
  estimator:
    _target_: sklearn.ensemble.RandomForestClassifier
  n_estimators: auto
_estimator_type: classifier
multioutput: false
estimates_feature_importances: false
estimates_feature_support: true
estimates_feature_ranking: true
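
For context on the `estimates_*` flags: after fitting, BorutaPy exposes a boolean `support_` mask and an integer `ranking_`, but no importance scores, which is why only feature support and ranking are enabled here. A rough sketch of that behaviour, assuming a compatible `boruta`/numpy installation (the dataset and parameters are illustrative):

```python
from boruta import BorutaPy
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=20, n_informative=4, random_state=0)

boruta = BorutaPy(RandomForestClassifier(n_jobs=-1), n_estimators="auto", random_state=0)
boruta.fit(X, y)

print(boruta.support_)  # boolean mask of confirmed features
print(boruta.ranking_)  # 1 = confirmed, 2 = tentative, higher = rejected
```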
7 changes: 7 additions & 0 deletions examples/algorithm-stability-yaml/conf/ranker/relieff.yaml
@@ -0,0 +1,7 @@
name: ReliefF
estimator:
  _target_: benchmark.ReliefF_FeatureSelection
  n_features_to_select: 10 # select best 10 features in feature subset.
_estimator_type: classifier
estimates_feature_importances: true
estimates_feature_support: true
6 changes: 6 additions & 0 deletions examples/algorithm-stability-yaml/conf/validator/knn.yaml
@@ -0,0 +1,6 @@
name: k-NN
estimator:
  _target_: sklearn.neighbors.KNeighborsClassifier
_estimator_type: classifier
multioutput: false
estimates_target: true
6 changes: 3 additions & 3 deletions fseval/callbacks/to_sql.py
@@ -3,11 +3,11 @@
from typing import Dict

import pandas as pd
from fseval.config.callbacks.to_sql import ToSQLCallback
from fseval.types import TerminalColor
from omegaconf import MISSING, DictConfig
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool

from fseval.config.callbacks.to_sql import ToSQLCallback
from fseval.types import TerminalColor

from ._base_export_callback import BaseExportCallback

4 changes: 2 additions & 2 deletions fseval/pipelines/_experiment.py
@@ -7,10 +7,10 @@

import numpy as np
import pandas as pd
from humanfriendly import format_timespan

from fseval.pipeline.estimator import Estimator
from fseval.types import AbstractEstimator, Callback, TerminalColor
from humanfriendly import format_timespan
from sqlalchemy.engine import Engine


@dataclass
5 changes: 3 additions & 2 deletions fseval/pipelines/rank_and_validate/_support_validator.py
@@ -63,10 +63,11 @@ def score(self, X, y, **kwargs) -> Union[Dict, pd.DataFrame, np.generic, None]:
         scores = pd.DataFrame([scores_dict])

         # add custom metrics
+        X_, y_ = self._prepare_data(X, y)
+
         for metric_name, metric_class in self.metrics.items():
-            X, y = self._prepare_data(X, y)
             scores_metric = metric_class.score_support(  # type: ignore
-                scores, self.validator, X, y, self.callbacks
+                scores, self.validator, X_, y_, self.callbacks
             )  # type: ignore

             if scores_metric is not None:
3 changes: 2 additions & 1 deletion tests/integration/test_main.py
@@ -2,9 +2,10 @@
import tempfile

import pytest
from hydra.conf import ConfigStore

from fseval.config import EstimatorConfig, PipelineConfig
from fseval.main import run_pipeline
from fseval.types import IncompatibilityError
from fseval.utils.hydra_utils import get_config
from hydra.conf import ConfigStore
from hydra.errors import InstantiationException
1 change: 0 additions & 1 deletion website/docs/_recipes/algorithm-stability.md

This file was deleted.

3 changes: 0 additions & 3 deletions website/docs/_recipes/running-on-aws.md

This file was deleted.

1 change: 0 additions & 1 deletion website/docs/_recipes/running-on-slurm.md

This file was deleted.

8 changes: 8 additions & 0 deletions website/docs/quick-start.mdx
@@ -144,6 +144,14 @@ We can now decide how to export the results. We can upload our results to a live
sql_con=sqlite:////Users/dunnkers/Downloads/results.sqlite # any well-defined database URL
```

:::note Relative vs absolute paths

If you define a _relative_ database URL, like `sql_con=sqlite:///./results.sqlite`, the results are saved right where Hydra stores its individual run files. In other words, a separate `.sqlite` file ends up in each `./multirun` subfolder.

To avoid this and store all results in a single `.sqlite` file, use an **absolute** path, like above. Preferably, though, use a proper running database; see the recipes for more instructions.

:::

We are now ready to run an experiment. In a terminal, `cd` into the unzipped example directory and run the following:
```shell
python benchmark.py --multirun ranker='glob(*)' +callbacks.to_sql.url=$sql_con
Expand Down