-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #416 from v715/parallelize-ObliqueSplitter
Parallelize Gini impurity calculation for `ObliqueSplitter`
- Loading branch information
Showing
2 changed files
with
71 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,20 +2,14 @@ | |
Main Author: Will LeVine | ||
Corresponding Email: [email protected] | ||
""" | ||
import keras | ||
import numpy as np | ||
|
||
from sklearn.tree import DecisionTreeClassifier | ||
from itertools import product | ||
from joblib import Parallel, delayed | ||
from sklearn.base import BaseEstimator | ||
from sklearn.random_projection import SparseRandomProjection | ||
|
||
|
||
from sklearn.utils.validation import ( | ||
check_X_y, | ||
check_array, | ||
check_is_fitted, | ||
) | ||
|
||
import keras as keras | ||
from sklearn.tree import DecisionTreeClassifier | ||
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y | ||
|
||
from .base import BaseTransformer | ||
|
||
|
@@ -347,6 +341,9 @@ class ObliqueSplitter: | |
Ratio of non-zero components in the random projection matrix, in the range '(0, 1]'. | ||
random_state : int | ||
Controls the pseudo random number generator used to generate the projection matrix. | ||
workers : int | ||
The number of cores used to parallelize the calculation of Gini impurity. | ||
Supply -1 to use all cores available to the process. | ||
Methods | ||
------- | ||
|
@@ -357,13 +354,15 @@ class ObliqueSplitter: | |
node. | ||
score(y_sort, t) | ||
Finds the Gini impurity for a split. | ||
_score(proj_X, y_sample, i, j) | ||
Sorts the labels by one projected feature, then finds the Gini impurity for a split. | ||
impurity(idx) | ||
Finds the impurity for a certain set of samples. | ||
split(sample_inds) | ||
Determines the best possible split for the given set of samples. | ||
""" | ||
|
||
def __init__(self, X, y, proj_dims, density, random_state): | ||
def __init__(self, X, y, proj_dims, density, random_state, workers): | ||
|
||
self.X = X | ||
self.y = y | ||
|
@@ -377,6 +376,7 @@ def __init__(self, X, y, proj_dims, density, random_state): | |
self.proj_dims = proj_dims | ||
self.density = density | ||
self.random_state = random_state | ||
self.workers = workers | ||
|
||
def sample_proj_mat(self, sample_inds): | ||
""" | ||
|
@@ -473,6 +473,38 @@ def score(self, y_sort, t): | |
) * right_gini | ||
return gini | ||
|
||
def _score(self, proj_X, y_sample, i, j): | ||
""" | ||
Sorts the labels by the jth projected feature, then calculates the Gini impurity of the split at index i. | ||
Parameters | ||
---------- | ||
proj_X : {ndarray, sparse matrix} of shape (n_samples, self.proj_dims) | ||
Projected input data matrix. | ||
y_sample : array of shape [n_samples] | ||
Labels for sample of data. | ||
i : int | ||
The index at which to split y_sort (the split threshold position). | ||
j : int | ||
The index of the projection dimension to consider. | ||
Returns | ||
------- | ||
gini : float | ||
The Gini impurity of the split. | ||
i : int | ||
The index at which y_sort was split, returned unchanged. | ||
j : int | ||
The index of the projection dimension considered, returned unchanged. | ||
""" | ||
# Sort labels by the jth feature | ||
idx = np.argsort(proj_X[:, j]) | ||
y_sort = y_sample[idx] | ||
|
||
gini = self.score(y_sort, i) | ||
|
||
return gini, i, j | ||
|
||
# Returns impurity for a group of examples | ||
# expects idx not None | ||
def impurity(self, idx): | ||
|
@@ -503,12 +535,9 @@ def impurity(self, idx): | |
return 1 - gini | ||
|
||
# Finds the best split | ||
# This needs to be parallelized; its a major bottleneck | ||
def split(self, sample_inds): | ||
""" | ||
Finds the optimal split for a set of samples. | ||
Note that the code for this method needs to be parallelized. This is a major | ||
bottleneck in integration with scikit-learn. | ||
Parameters | ||
---------- | ||
|
@@ -533,17 +562,13 @@ def split(self, sample_inds): | |
Q[0, :] = node_impurity | ||
Q[-1, :] = node_impurity | ||
|
||
# Loop through projected features and examples to find best split | ||
# This can be parallelized for sure | ||
for j in range(self.proj_dims): | ||
|
||
# Sort labels by the jth feature | ||
idx = np.argsort(proj_X[:, j]) | ||
y_sort = y_sample[idx] | ||
|
||
Q[1:-1, j] = np.array( | ||
[self.score(y_sort, i) for i in range(1, n_samples - 1)] | ||
) | ||
# Loop through examples and projected features to calculate split scores | ||
split_iterator = product(range(1, n_samples - 1), range(self.proj_dims)) | ||
scores = Parallel(n_jobs=self.workers)( | ||
delayed(self._score)(proj_X, y_sample, i, j) for i, j in split_iterator | ||
) | ||
for gini, i, j in scores: | ||
Q[i, j] = gini | ||
|
||
# Identify best split feature, minimum gini impurity | ||
best_split_ind = np.argmin(Q) | ||
|
@@ -916,7 +941,7 @@ def predict(self, X): | |
predictions = np.zeros(X.shape[0]) | ||
for i in range(X.shape[0]): | ||
cur = self.nodes[0] | ||
while not cur is None and not cur.is_leaf: | ||
while cur is not None and not cur.is_leaf: | ||
proj_X = cur.proj_mat.transform(X) | ||
if proj_X[i, cur.feature] < cur.threshold: | ||
id = cur.left_child | ||
|
@@ -957,6 +982,9 @@ class ObliqueTreeClassifier(BaseEstimator): | |
The feature combinations to use for the oblique split. | ||
density : float | ||
Density estimate. | ||
workers : int, optional (default: -1) | ||
The number of cores used to parallelize the calculation of Gini impurity. | ||
Supply -1 to use all cores available to the process. | ||
Methods | ||
------- | ||
|
@@ -990,7 +1018,8 @@ def __init__( | |
# ccp_alpha=0.0, | ||
# New args | ||
feature_combinations=1.5, | ||
density=0.5 | ||
density=0.5, | ||
workers=-1, | ||
): | ||
|
||
# self.criterion=criterion | ||
|
@@ -1008,6 +1037,7 @@ def __init__( | |
|
||
self.feature_combinations = feature_combinations | ||
self.density = density | ||
self.workers = workers | ||
|
||
def fit(self, X, y): | ||
""" | ||
|
@@ -1028,7 +1058,7 @@ def fit(self, X, y): | |
|
||
self.proj_dims = int(np.ceil(X.shape[1]) / self.feature_combinations) | ||
splitter = ObliqueSplitter( | ||
X, y, self.proj_dims, self.density, self.random_state | ||
X, y, self.proj_dims, self.density, self.random_state, self.workers | ||
) | ||
|
||
self.tree = ObliqueTree( | ||
|