Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
rosecers committed May 30, 2023
1 parent d518ebf commit d5f020e
Show file tree
Hide file tree
Showing 12 changed files with 238 additions and 176 deletions.
111 changes: 63 additions & 48 deletions src/skmatter/_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class GreedySelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
selection_type : str, {'feature', 'sample'}
whether to choose a subset of columns ('feature') or rows ('sample').
Stored in :py:attr:`self._axis_name` (as text) and :py:attr:`self._axis`
Stored in :py:attr:`self.axis_name` (as text) and :py:attr:`self.axis`
(as 0 or 1 for 'sample' or 'feature', respectively).
n_to_select : int or float, default=None
Expand All @@ -63,14 +63,16 @@ class GreedySelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
progress_bar: bool, default=False
option to use `tqdm <https://tqdm.github.io/>`_
progress bar to monitor selections. Stored in
:py:attr:`self.report_progress`.
:py:attr:`self.report_progress_`.
full : bool, default=False
In the case that all non-redundant selections are exhausted, choose
randomly from the remaining features. Stored in :py:attr:`self.full`.
random_state: int or RandomState instance, default=0
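axis : {0, 1}, default=None
axis over which selection is performed (0 for 'sample', 1 for 'feature'). When given, it is used in place of `selection_type`; any other value raises a ValueError.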
Attributes
----------
n_selected_ : int
Expand All @@ -93,12 +95,27 @@ def __init__(
progress_bar=False,
full=False,
random_state=0,
axis=None,
):
self.selection_type = selection_type
if selection_type is not None and axis is None:
self.selection_type = selection_type
if selection_type == "feature":
self.axis = 1
elif selection_type == "sample":
self.axis = 0
else:
raise ValueError("Only feature and sample selection supported.")
elif axis is not None:
if axis in [0, 1]:
self.axis = axis
self.selection_type = ["sample", "feature"][axis]
else:
raise ValueError(
"Only feature (axis=1) and sample (axis=0) selection supported."
)
self.n_to_select = n_to_select
self.score_threshold = score_threshold
self.score_threshold_type = score_threshold_type
self._first_score = None
if self.score_threshold_type not in ["relative", "absolute"]:
raise ValueError(
"invalid score_threshold_type, expected one of 'relative' or 'absolute'"
Expand Down Expand Up @@ -128,28 +145,21 @@ def fit(self, X, y=None, warm_start=False):
"""
tags = self._get_tags()

if self.selection_type == "feature":
self._axis = 1
elif self.selection_type == "sample":
self._axis = 0
else:
raise ValueError("Only feature and sample selection supported.")

if self.full and self.score_threshold is not None:
raise ValueError(
"You cannot specify both `score_threshold` and `full=True`."
)

if self.progress_bar is True:
self.report_progress = get_progress_bar()
self.report_progress_ = get_progress_bar()
elif self.progress_bar is False:
self.report_progress = no_progress_bar
self.report_progress_ = no_progress_bar

params = dict(
accept_sparse="csc",
force_all_finite=not tags.get("allow_nan", True),
)
if self._axis == 1:
if self.axis == 1:
params["ensure_min_features"] = 2
else:
params["ensure_min_samples"] = 2
Expand All @@ -166,7 +176,8 @@ def fit(self, X, y=None, warm_start=False):
else:
X = check_array(X, **params)

n_to_select_from = X.shape[self._axis]
n_to_select_from = X.shape[self.axis]
self.n_samples_in_, self.n_features_in_ = X.shape

error_msg = (
"n_to_select must be either None, an "
Expand Down Expand Up @@ -203,7 +214,7 @@ def fit(self, X, y=None, warm_start=False):

n_iterations -= self.n_selected_

for n in self.report_progress(range(n_iterations)):
for n in self.report_progress_(range(n_iterations)):
new_idx = self._get_best_new_selection(self.score, X, y)
if new_idx is not None:
self._update_post_selection(X, y, new_idx)
Expand All @@ -214,7 +225,7 @@ def fit(self, X, y=None, warm_start=False):
stacklevel=1,
)
self.X_selected_ = np.take(
self.X_selected_, np.arange(self.n_selected_), axis=self._axis
self.X_selected_, np.arange(self.n_selected_), axis=self.axis
)

if hasattr(self, "y_selected_"):
Expand Down Expand Up @@ -255,13 +266,16 @@ def transform(self, X, y=None):
accept_sparse="csr",
force_all_finite=not _safe_tags(self, key="allow_nan"),
reset=False,
ensure_2d=self._axis,
ensure_2d=self.axis,
)

if len(mask) != X.shape[self._axis]:
raise ValueError("X has a different shape than during fitting.")
if len(X.shape) == 1:
if self.axis == 0:
X = X.reshape(-1, 1)
else:
X = X.reshape(1, -1)

if self._axis == 1:
if self.axis == 1:
return X[:, safe_mask(X, mask)]
else:
return X[safe_mask(X, mask)]
Expand Down Expand Up @@ -326,13 +340,14 @@ def _init_greedy_search(self, X, y, n_to_select):
"""Initializes the search. Prepares an array to store the selected features."""

self.n_selected_ = 0
self.first_score_ = None

sel_shape = list(X.shape)
sel_shape[self._axis] = n_to_select
sel_shape[self.axis] = n_to_select

self.X_selected_ = np.zeros(sel_shape, float)

if y is not None and self._axis == 0:
if y is not None and self.axis == 0:
self.y_selected_ = np.zeros(
(n_to_select, y.reshape(y.shape[0], -1).shape[1]), float
)
Expand All @@ -342,7 +357,7 @@ def _continue_greedy_search(self, X, y, n_to_select):
"""Continues the search. Prepares an array to store the selected features."""

n_pad = [(0, 0), (0, 0)]
n_pad[self._axis] = (0, n_to_select - self.n_selected_)
n_pad[self.axis] = (0, n_to_select - self.n_selected_)

self.X_selected_ = np.pad(
self.X_selected_,
Expand All @@ -368,15 +383,15 @@ def _get_best_new_selection(self, scorer, X, y):

max_score_idx = np.argmax(scores)
if self.score_threshold is not None:
if self._first_score is None:
self._first_score = scores[max_score_idx]
if self.first_score_ is None:
self.first_score_ = scores[max_score_idx]

if self.score_threshold_type == "absolute":
if scores[max_score_idx] < self.score_threshold:
return None

if self.score_threshold_type == "relative":
if scores[max_score_idx] / self._first_score < self.score_threshold:
if scores[max_score_idx] / self.first_score_ < self.score_threshold:
return None

return max_score_idx
Expand All @@ -386,13 +401,13 @@ def _update_post_selection(self, X, y, last_selected):
Saves the most recently selected feature and increments the feature counter
"""

if self._axis == 1:
if self.axis == 1:
self.X_selected_[:, self.n_selected_] = np.take(
X, last_selected, axis=self._axis
X, last_selected, axis=self.axis
)
else:
self.X_selected_[self.n_selected_] = np.take(
X, last_selected, axis=self._axis
X, last_selected, axis=self.axis
)

if hasattr(self, "y_selected_"):
Expand Down Expand Up @@ -421,7 +436,7 @@ def _get_support_mask(self):

def _postprocess(self, X, y):
"""Post-process X and / or y when selection is finished"""
self.support_ = np.full(X.shape[self._axis], False)
self.support_ = np.full(X.shape[self.axis], False)
self.support_[self.selected_idx_] = True

def _more_tags(self):
Expand Down Expand Up @@ -532,7 +547,7 @@ def _continue_greedy_search(self, X, y, n_to_select):

for c in self.selected_idx_:
if self.recompute_every != 0 and (
np.linalg.norm(np.take(self.X_current_, [c], axis=self._axis))
np.linalg.norm(np.take(self.X_current_, [c], axis=self.axis))
> self.tolerance
):
self._orthogonalize(last_selected=c)
Expand Down Expand Up @@ -576,7 +591,7 @@ def _compute_pi(self, X, y=None):
:math:`\\pi` importance for the given samples or features
"""

if self._axis == 0:
if self.axis == 0:
U, _, _ = scipy.sparse.linalg.svds(X, k=self.k, return_singular_vectors="u")
U = np.real(U)
new_pi = (U[:, : self.k] ** 2.0).sum(axis=1)
Expand Down Expand Up @@ -605,7 +620,7 @@ def _update_post_selection(self, X, y, last_selected):
self.pi_[last_selected] = 0.0

def _orthogonalize(self, last_selected):
if self._axis == 1:
if self.axis == 1:
self.X_current_ = X_orthogonalizer(
x1=self.X_current_, c=last_selected, tol=self.tolerance
)
Expand Down Expand Up @@ -732,7 +747,7 @@ def _continue_greedy_search(self, X, y, n_to_select):

for c in self.selected_idx_:
if self.recompute_every != 0 and (
np.linalg.norm(np.take(self.X_current_, [c], axis=self._axis))
np.linalg.norm(np.take(self.X_current_, [c], axis=self.axis))
> self.tolerance
):
self._orthogonalize(last_selected=c)
Expand Down Expand Up @@ -800,7 +815,7 @@ def _compute_pi(self, X, y=None):
:math:`\pi` importance for the given samples or features
"""

if self._axis == 0:
if self.axis == 0:
pcovr_distance = pcovr_kernel(
self.mixing,
X,
Expand All @@ -825,7 +840,7 @@ def _compute_pi(self, X, y=None):
return pi

def _orthogonalize(self, last_selected):
if self._axis == 1:
if self.axis == 1:
self.X_current_ = X_orthogonalizer(
x1=self.X_current_, c=last_selected, tol=self.tolerance
)
Expand All @@ -834,7 +849,7 @@ def _orthogonalize(self, last_selected):
x1=self.X_current_.T, c=last_selected, tol=self.tolerance
).T
if self.y_current_ is not None:
if self._axis == 1:
if self.axis == 1:
self.y_current_ = Y_feature_orthogonalizer(
self.y_current_, X=self.X_selected_, tol=self.tolerance
)
Expand Down Expand Up @@ -960,13 +975,13 @@ def _init_greedy_search(self, X, y, n_to_select):

super()._init_greedy_search(X, y, n_to_select)

self.norms_ = (X**2).sum(axis=abs(self._axis - 1))
self.haussdorf_ = np.full(X.shape[self._axis], np.inf)
self.haussdorf_at_select_ = np.full(X.shape[self._axis], np.inf)
self.norms_ = (X**2).sum(axis=abs(self.axis - 1))
self.haussdorf_ = np.full(X.shape[self.axis], np.inf)
self.haussdorf_at_select_ = np.full(X.shape[self.axis], np.inf)

if self.initialize == "random":
random_state = check_random_state(self.random_state)
initialize = random_state.randint(X.shape[self._axis])
initialize = random_state.randint(X.shape[self.axis])
self.selected_idx_[0] = initialize
self._update_post_selection(X, y, self.selected_idx_[0])
elif isinstance(self.initialize, numbers.Integral):
Expand All @@ -986,7 +1001,7 @@ def _update_haussdorf(self, X, y, last_selected):
self.haussdorf_at_select_[last_selected] = self.haussdorf_[last_selected]

# distances of all points to the new point
if self._axis == 1:
if self.axis == 1:
new_dist = (
self.norms_ + self.norms_[last_selected] - 2 * X[:, last_selected].T @ X
)
Expand Down Expand Up @@ -1116,7 +1131,7 @@ def _init_greedy_search(self, X, y, n_to_select):

super()._init_greedy_search(X, y, n_to_select)

if self._axis == 1:
if self.axis == 1:
self.pcovr_distance_ = pcovr_covariance(mixing=self.mixing, X=X, Y=y)
else:
self.pcovr_distance_ = pcovr_kernel(mixing=self.mixing, X=X, Y=y)
Expand All @@ -1125,15 +1140,15 @@ def _init_greedy_search(self, X, y, n_to_select):

if self.initialize == "random":
random_state = check_random_state(self.random_state)
initialize = random_state.randint(X.shape[self._axis])
initialize = random_state.randint(X.shape[self.axis])
elif isinstance(self.initialize, numbers.Integral):
initialize = self.initialize
else:
raise ValueError("Invalid value of the initialize parameter")

self.selected_idx_[0] = initialize
self.haussdorf_ = np.full(X.shape[self._axis], np.inf)
self.haussdorf_at_select_ = np.full(X.shape[self._axis], np.inf)
self.haussdorf_ = np.full(X.shape[self.axis], np.inf)
self.haussdorf_at_select_ = np.full(X.shape[self.axis], np.inf)
self._update_post_selection(X, y, self.selected_idx_[0])

def _update_haussdorf(self, X, y, last_selected):
Expand All @@ -1143,7 +1158,7 @@ def _update_haussdorf(self, X, y, last_selected):
new_dist = (
self.norms_
+ self.norms_[last_selected]
- 2 * np.take(self.pcovr_distance_, last_selected, axis=self._axis)
- 2 * np.take(self.pcovr_distance_, last_selected, axis=self.axis)
)

# update the Hausdorff distance list in place
Expand Down
Loading

0 comments on commit d5f020e

Please sign in to comment.