From ebee6cd6e19cfc39e50a460c4322296ceba38d13 Mon Sep 17 00:00:00 2001 From: Bora Uyar Date: Thu, 22 Feb 2024 10:17:55 +0100 Subject: [PATCH] use correlation threshold instead of boolean --- flexynesis/feature_selection.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/flexynesis/feature_selection.py b/flexynesis/feature_selection.py index 6bd4220..c77e496 100644 --- a/flexynesis/feature_selection.py +++ b/flexynesis/feature_selection.py @@ -54,7 +54,7 @@ def laplacian_score(X, k=5, t=None): return np.array(scores) -def remove_redundant_features(X, laplacian_scores, threshold=0.8, topN=None): +def remove_redundant_features(X, laplacian_scores, threshold, topN=None): """ Selects features based on Laplacian scores while avoiding highly correlated features. @@ -108,7 +108,7 @@ def remove_redundant_features(X, laplacian_scores, threshold=0.8, topN=None): return selected_features -def filter_by_laplacian(X, layer, k=5, t=None, topN=100, remove_redundant=True, threshold=0.8): +def filter_by_laplacian(X, layer, k=5, t=None, topN=100, correlation_threshold=0.9): """ Given a data matrix, compute laplacian score for each feature and return a filtered data matrix based on top laplacian scores. @@ -143,10 +143,11 @@ def filter_by_laplacian(X, layer, k=5, t=None, topN=100, remove_redundant=True, topN_extended = min(topN_extended, X.shape[1]) # Ensure we don't exceed the number of features selected_features = sorted_indices[:topN_extended] - if remove_redundant: + if correlation_threshold < 1: # Remove redundancy from topN + 10% features selected_features = remove_redundant_features(X[X.columns[selected_features]].values, - scores[selected_features], threshold, topN) + scores[selected_features], correlation_threshold, + topN) # Prune down to topN features selected_features = selected_features[:topN]