Skip to content

Commit

Permalink
Merge pull request #416 from v715/parallelize-ObliqueSplitter
Browse files Browse the repository at this point in the history
Parallelize Gini impurity calculation for `ObliqueSplitter`
  • Loading branch information
PSSF23 authored Jan 13, 2021
2 parents b3bd24b + b41e31e commit 1ffe82c
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 33 deletions.
16 changes: 12 additions & 4 deletions proglearn/tests/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ def test_sample_projmat(self):
random_state = 0
rng.seed(random_state)

workers = -1

X = rng.rand(100, 100)
y = np.zeros(100)

Expand All @@ -62,7 +64,7 @@ def test_sample_projmat(self):
n_sample_inds = [10, 20, 40, 60, 80]

for pd in proj_dims:
splitter = ObliqueSplitter(X, y, pd, density, random_state)
splitter = ObliqueSplitter(X, y, pd, density, random_state, workers)

for i in range(len(n_sample_inds)):
si = sample_inds[i]
Expand All @@ -77,13 +79,15 @@ def test_score(self):
random_state = 0
rng.seed(random_state)

workers = -1

X = rng.rand(11, 11)

density = 0.5
proj_dims = 5

y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
splitter = ObliqueSplitter(X, y, proj_dims, density, random_state)
splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers)

score = splitter.score(y, 6)
assert 0 == score
Expand All @@ -96,6 +100,8 @@ def test_impurity(self):
random_state = 0
rng.seed(random_state)

workers = -1

X = rng.rand(100, 100)

density = 0.5
Expand All @@ -106,7 +112,7 @@ def test_impurity(self):
for j in range(10):
y[10 * i + j] = i

splitter = ObliqueSplitter(X, y, proj_dims, density, random_state)
splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers)

# Impurity of one thing should be 0
impurity = splitter.impurity([0])
Expand All @@ -129,6 +135,8 @@ def test_split(self):
random_state = 0
rng.seed(random_state)

workers = -1

X = rng.rand(100, 100)

density = 0.5
Expand All @@ -139,7 +147,7 @@ def test_split(self):
for j in range(10):
y[10 * i + j] = i

splitter = ObliqueSplitter(X, y, proj_dims, density, random_state)
splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers)

split_info = splitter.split(np.array([i for i in range(100)]))

Expand Down
88 changes: 59 additions & 29 deletions proglearn/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,14 @@
Main Author: Will LeVine
Corresponding Email: [email protected]
"""
import keras
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from itertools import product
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.random_projection import SparseRandomProjection


from sklearn.utils.validation import (
check_X_y,
check_array,
check_is_fitted,
)

import keras as keras
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

from .base import BaseTransformer

Expand Down Expand Up @@ -347,6 +341,9 @@ class ObliqueSplitter:
Ratio of non-zero component in the random projection matrix in the range '(0, 1]'.
random_state : int
Controls the pseudo random number generator used to generate the projection matrix.
workers : int
The number of cores used to parallelize the calculation of Gini impurity.
Supply -1 to use all cores available to the process.
Methods
-------
Expand All @@ -357,13 +354,15 @@ class ObliqueSplitter:
node.
score(y_sort, t)
Finds the Gini impurity for a split.
_score(proj_X, y_sample, i, j)
Sorts the labels by one projected feature, then calculates the Gini impurity of a split.
impurity(idx)
Finds the impurity for a certain set of samples.
split(sample_inds)
Determines the best possible split for the given set of samples.
"""

def __init__(self, X, y, proj_dims, density, random_state):
def __init__(self, X, y, proj_dims, density, random_state, workers):

self.X = X
self.y = y
Expand All @@ -377,6 +376,7 @@ def __init__(self, X, y, proj_dims, density, random_state):
self.proj_dims = proj_dims
self.density = density
self.random_state = random_state
self.workers = workers

def sample_proj_mat(self, sample_inds):
"""
Expand Down Expand Up @@ -473,6 +473,38 @@ def score(self, y_sort, t):
) * right_gini
return gini

def _score(self, proj_X, y_sample, i, j):
    """
    Scores one candidate split along one projected feature.

    The labels are ordered by the values in column ``j`` of ``proj_X`` and
    the Gini impurity of splitting that ordering at position ``i`` is
    computed with ``self.score``.

    Parameters
    ----------
    proj_X : {ndarray, sparse matrix} of shape (n_samples, self.proj_dims)
        Projected input data matrix.
    y_sample : array of shape [n_samples]
        Labels for sample of data.
    i : int
        Position in the sorted labels at which to split; passed to
        ``self.score`` as its ``t`` argument.
    j : int
        Index of the projected feature (column of ``proj_X``) to sort by.

    Returns
    -------
    gini : float
        The Gini impurity of the split.
    i : int
        The split position, returned unchanged.
    j : int
        The projected feature index, returned unchanged.

    Notes
    -----
    ``i`` and ``j`` are echoed back so that a caller collecting many
    results can tell which (position, feature) pair each score belongs to.
    """
    column = proj_X[:, j]

    # A sparse column cannot be handed to np.argsort directly; densify it
    # first. Detection is duck-typed so no extra import is required.
    if hasattr(column, "toarray"):
        column = column.toarray()

    # Flatten so that (n_samples, 1) columns sort along the sample axis.
    # For a plain 1-D ndarray column this is a no-op.
    order = np.argsort(np.asarray(column).ravel())

    # Sort labels by the jth feature
    y_sort = y_sample[order]

    gini = self.score(y_sort, i)

    return gini, i, j

# Returns impurity for a group of examples
# expects idx not None
def impurity(self, idx):
Expand Down Expand Up @@ -503,12 +535,9 @@ def impurity(self, idx):
return 1 - gini

# Finds the best split
# This needs to be parallelized; its a major bottleneck
def split(self, sample_inds):
"""
Finds the optimal split for a set of samples.
Note that the code for this method needs to be parallelized. This is a major
bottleneck in integration with scikit-learn.
Parameters
----------
Expand All @@ -533,17 +562,13 @@ def split(self, sample_inds):
Q[0, :] = node_impurity
Q[-1, :] = node_impurity

# Loop through projected features and examples to find best split
# This can be parallelized for sure
for j in range(self.proj_dims):

# Sort labels by the jth feature
idx = np.argsort(proj_X[:, j])
y_sort = y_sample[idx]

Q[1:-1, j] = np.array(
[self.score(y_sort, i) for i in range(1, n_samples - 1)]
)
# Loop through examples and projected features to calculate split scores
split_iterator = product(range(1, n_samples - 1), range(self.proj_dims))
scores = Parallel(n_jobs=self.workers)(
delayed(self._score)(proj_X, y_sample, i, j) for i, j in split_iterator
)
for gini, i, j in scores:
Q[i, j] = gini

# Identify best split feature, minimum gini impurity
best_split_ind = np.argmin(Q)
Expand Down Expand Up @@ -916,7 +941,7 @@ def predict(self, X):
predictions = np.zeros(X.shape[0])
for i in range(X.shape[0]):
cur = self.nodes[0]
while not cur is None and not cur.is_leaf:
while cur is not None and not cur.is_leaf:
proj_X = cur.proj_mat.transform(X)
if proj_X[i, cur.feature] < cur.threshold:
id = cur.left_child
Expand Down Expand Up @@ -957,6 +982,9 @@ class ObliqueTreeClassifier(BaseEstimator):
The feature combinations to use for the oblique split.
density : float
Density estimate.
workers : int, optional (default: -1)
The number of cores used to parallelize the calculation of Gini impurity.
Supply -1 to use all cores available to the process.
Methods
-------
Expand Down Expand Up @@ -990,7 +1018,8 @@ def __init__(
# ccp_alpha=0.0,
# New args
feature_combinations=1.5,
density=0.5
density=0.5,
workers=-1,
):

# self.criterion=criterion
Expand All @@ -1008,6 +1037,7 @@ def __init__(

self.feature_combinations = feature_combinations
self.density = density
self.workers = workers

def fit(self, X, y):
"""
Expand All @@ -1028,7 +1058,7 @@ def fit(self, X, y):

self.proj_dims = int(np.ceil(X.shape[1]) / self.feature_combinations)
splitter = ObliqueSplitter(
X, y, self.proj_dims, self.density, self.random_state
X, y, self.proj_dims, self.density, self.random_state, self.workers
)

self.tree = ObliqueTree(
Expand Down

0 comments on commit 1ffe82c

Please sign in to comment.