From 874230e0f39db5cc0252b266b369757106372daa Mon Sep 17 00:00:00 2001 From: v715 Date: Tue, 12 Jan 2021 23:20:40 -0500 Subject: [PATCH 1/5] Add wrapper for calculating Gini impurity --- proglearn/transformers.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/proglearn/transformers.py b/proglearn/transformers.py index 93546fc116..db4d6a8023 100644 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -357,6 +357,8 @@ class ObliqueSplitter: node. score(y_sort, t) Finds the Gini impurity for a split. + _score(proj_X, y_sample, i, j) + Handles array indexing before calculating Gini impurity. impurity(idx) Finds the impurity for a certain set of samples. split(sample_inds) @@ -473,6 +475,38 @@ def score(self, y_sort, t): ) * right_gini return gini + def _score(self, proj_X, y_sample, i, j): + """ + Handles array indexing before calculating Gini impurity + + Parameters + ---------- + proj_X : {ndarray, sparse matrix} of shape (n_samples, self.proj_dims) + Projected input data matrix. + y_sample : array of shape [n_samples] + Labels for sample of data. + i : int + The threshold determining where to split y_sort. + j : int + The projection dimension to consider. + + Returns + ------- + gini : float + The Gini impurity of the split. + i : int + The threshold determining where to split y_sort. + j : int + The projection dimension to consider. 
+ """ + # Sort labels by the jth feature + idx = np.argsort(proj_X[:, j]) + y_sort = y_sample[idx] + + gini = self.score(y_sort, i) + + return gini, i, j + # Returns impurity for a group of examples # expects idx not None def impurity(self, idx): From 95663670eff8020940bc9829f0678f4e5f202194 Mon Sep 17 00:00:00 2001 From: v715 Date: Tue, 12 Jan 2021 23:21:05 -0500 Subject: [PATCH 2/5] Parallelize Gini impurity calculation --- proglearn/transformers.py | 55 ++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/proglearn/transformers.py b/proglearn/transformers.py index db4d6a8023..bacbf4397d 100644 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -2,20 +2,15 @@ Main Author: Will LeVine Corresponding Email: levinewill@icloud.com """ -import numpy as np +from itertools import product -from sklearn.tree import DecisionTreeClassifier +import keras as keras +import numpy as np +from joblib import Parallel, delayed from sklearn.base import BaseEstimator from sklearn.random_projection import SparseRandomProjection - - -from sklearn.utils.validation import ( - check_X_y, - check_array, - check_is_fitted, -) - -import keras as keras +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils.validation import check_array, check_is_fitted, check_X_y from .base import BaseTransformer @@ -347,6 +342,9 @@ class ObliqueSplitter: Ratio of non-zero component in the random projection matrix in the range '(0, 1]'. random_state : int Controls the pseudo random number generator used to generate the projection matrix. + workers : int + The number of cores to parallelize the p-value computation over. + Supply -1 to use all cores available to the Process. Methods ------- @@ -365,7 +363,7 @@ class ObliqueSplitter: Determines the best possible split for the given set of samples. 
""" - def __init__(self, X, y, proj_dims, density, random_state): + def __init__(self, X, y, proj_dims, density, random_state, workers): self.X = X self.y = y @@ -379,6 +377,7 @@ def __init__(self, X, y, proj_dims, density, random_state): self.proj_dims = proj_dims self.density = density self.random_state = random_state + self.workers = workers def sample_proj_mat(self, sample_inds): """ @@ -537,12 +536,9 @@ def impurity(self, idx): return 1 - gini # Finds the best split - # This needs to be parallelized; its a major bottleneck def split(self, sample_inds): """ Finds the optimal split for a set of samples. - Note that the code for this method needs to be parallelized. This is a major - bottleneck in integration with scikit-learn. Parameters ---------- @@ -567,17 +563,13 @@ def split(self, sample_inds): Q[0, :] = node_impurity Q[-1, :] = node_impurity - # Loop through projected features and examples to find best split - # This can be parallelized for sure - for j in range(self.proj_dims): - - # Sort labels by the jth feature - idx = np.argsort(proj_X[:, j]) - y_sort = y_sample[idx] - - Q[1:-1, j] = np.array( - [self.score(y_sort, i) for i in range(1, n_samples - 1)] - ) + # Loop through examples and projected features to calculate split scores + split_iterator = product(range(1, n_samples - 1), range(self.proj_dims)) + scores = Parallel(n_jobs=-1)( + delayed(self._score)(proj_X, y_sample, i, j) for i, j in split_iterator + ) + for gini, i, j in scores: + Q[i, j] = gini # Identify best split feature, minimum gini impurity best_split_ind = np.argmin(Q) @@ -950,7 +942,7 @@ def predict(self, X): predictions = np.zeros(X.shape[0]) for i in range(X.shape[0]): cur = self.nodes[0] - while not cur is None and not cur.is_leaf: + while cur is not None and not cur.is_leaf: proj_X = cur.proj_mat.transform(X) if proj_X[i, cur.feature] < cur.threshold: id = cur.left_child @@ -991,6 +983,9 @@ class ObliqueTreeClassifier(BaseEstimator): The feature combinations to use for the 
oblique split. density : float Density estimate. + workers : int, optional (default: -1) + The number of cores to parallelize the p-value computation over. + Supply -1 to use all cores available to the Process. Methods ------- @@ -1024,7 +1019,8 @@ def __init__( # ccp_alpha=0.0, # New args feature_combinations=1.5, - density=0.5 + density=0.5, + workers=-1, ): # self.criterion=criterion @@ -1042,6 +1038,7 @@ def __init__( self.feature_combinations = feature_combinations self.density = density + self.workers = workers def fit(self, X, y): """ @@ -1062,7 +1059,7 @@ def fit(self, X, y): self.proj_dims = int(np.ceil(X.shape[1]) / self.feature_combinations) splitter = ObliqueSplitter( - X, y, self.proj_dims, self.density, self.random_state + X, y, self.proj_dims, self.density, self.random_state, self.workers ) self.tree = ObliqueTree( From dc722d0ee2738829fb4107577e77e5cb3d1523a2 Mon Sep 17 00:00:00 2001 From: v715 Date: Tue, 12 Jan 2021 23:21:16 -0500 Subject: [PATCH 3/5] Add workers argument to tests --- proglearn/tests/test_transformer.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/proglearn/tests/test_transformer.py b/proglearn/tests/test_transformer.py index a4de499ef3..ce1f1bb782 100644 --- a/proglearn/tests/test_transformer.py +++ b/proglearn/tests/test_transformer.py @@ -46,6 +46,8 @@ def test_sample_projmat(self): random_state = 0 rng.seed(random_state) + workers = -1 + X = rng.rand(100, 100) y = np.zeros(100) @@ -62,7 +64,7 @@ def test_sample_projmat(self): n_sample_inds = [10, 20, 40, 60, 80] for pd in proj_dims: - splitter = ObliqueSplitter(X, y, pd, density, random_state) + splitter = ObliqueSplitter(X, y, pd, density, random_state, workers) for i in range(len(n_sample_inds)): si = sample_inds[i] @@ -77,13 +79,15 @@ def test_score(self): random_state = 0 rng.seed(random_state) + workers = -1 + X = rng.rand(11, 11) density = 0.5 proj_dims = 5 y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - splitter = 
ObliqueSplitter(X, y, proj_dims, density, random_state) + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers) score = splitter.score(y, 6) assert 0 == score @@ -96,6 +100,8 @@ def test_impurity(self): random_state = 0 rng.seed(random_state) + workers = -1 + X = rng.rand(100, 100) density = 0.5 @@ -106,7 +112,7 @@ def test_impurity(self): for j in range(10): y[10 * i + j] = i - splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers) # Impurity of one thing should be 0 impurity = splitter.impurity([0]) @@ -129,6 +135,8 @@ def test_split(self): random_state = 0 rng.seed(random_state) + workers = -1 + X = rng.rand(100, 100) density = 0.5 @@ -139,7 +147,7 @@ def test_split(self): for j in range(10): y[10 * i + j] = i - splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers) split_info = splitter.split(np.array([i for i in range(100)])) From 2b6836fe130661d6fe11e3835ab0ff9a3879c5bb Mon Sep 17 00:00:00 2001 From: v715 Date: Tue, 12 Jan 2021 23:40:02 -0500 Subject: [PATCH 4/5] Fix docstring for workers argument --- proglearn/transformers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/proglearn/transformers.py b/proglearn/transformers.py index bacbf4397d..25baa6d3c1 100644 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -343,7 +343,7 @@ class ObliqueSplitter: random_state : int Controls the pseudo random number generator used to generate the projection matrix. workers : int - The number of cores to parallelize the p-value computation over. + The number of cores to parallelize the calculation of Gini impurity. Supply -1 to use all cores available to the Process. Methods @@ -984,7 +984,7 @@ class ObliqueTreeClassifier(BaseEstimator): density : float Density estimate. 
workers : int, optional (default: -1) - The number of cores to parallelize the p-value computation over. + The number of cores to parallelize the calculation of Gini impurity. Supply -1 to use all cores available to the Process. Methods From b41e31e9bf3e57df2c37ec87d9ead1e0be806e29 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 13 Jan 2021 09:13:16 -0500 Subject: [PATCH 5/5] Use specified worker numbers for parallelization Co-Authored-By: v715 --- proglearn/transformers.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/proglearn/transformers.py b/proglearn/transformers.py index 25baa6d3c1..b7f89a3960 100644 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -2,10 +2,9 @@ Main Author: Will LeVine Corresponding Email: levinewill@icloud.com """ -from itertools import product - -import keras as keras +import keras import numpy as np +from itertools import product from joblib import Parallel, delayed from sklearn.base import BaseEstimator from sklearn.random_projection import SparseRandomProjection @@ -565,7 +564,7 @@ def split(self, sample_inds): # Loop through examples and projected features to calculate split scores split_iterator = product(range(1, n_samples - 1), range(self.proj_dims)) - scores = Parallel(n_jobs=-1)( + scores = Parallel(n_jobs=self.workers)( delayed(self._score)(proj_X, y_sample, i, j) for i, j in split_iterator ) for gini, i, j in scores: