Merge pull request #416 from v715/parallelize-ObliqueSplitter

Parallelize Gini impurity calculation for `ObliqueSplitter`
neurodata · Jan 13, 2021 · 1ffe82c · 1ffe82c
2 parents b3bd24b + b41e31e
commit 1ffe82c
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 33 deletions.
diff --git a/proglearn/tests/test_transformer.py b/proglearn/tests/test_transformer.py
@@ -46,6 +46,8 @@ def test_sample_projmat(self):
         random_state = 0
         rng.seed(random_state)
 
+        workers = -1
+
         X = rng.rand(100, 100)
         y = np.zeros(100)
 
@@ -62,7 +64,7 @@ def test_sample_projmat(self):
         n_sample_inds = [10, 20, 40, 60, 80]
 
         for pd in proj_dims:
-            splitter = ObliqueSplitter(X, y, pd, density, random_state)
+            splitter = ObliqueSplitter(X, y, pd, density, random_state, workers)
 
             for i in range(len(n_sample_inds)):
                 si = sample_inds[i]
@@ -77,13 +79,15 @@ def test_score(self):
         random_state = 0
         rng.seed(random_state)
 
+        workers = -1
+
         X = rng.rand(11, 11)
 
         density = 0.5
         proj_dims = 5
 
         y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
-        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state)
+        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers)
 
         score = splitter.score(y, 6)
         assert 0 == score
@@ -96,6 +100,8 @@ def test_impurity(self):
         random_state = 0
         rng.seed(random_state)
 
+        workers = -1
+
         X = rng.rand(100, 100)
 
         density = 0.5
@@ -106,7 +112,7 @@ def test_impurity(self):
             for j in range(10):
                 y[10 * i + j] = i
 
-        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state)
+        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers)
 
         # Impurity of one thing should be 0
         impurity = splitter.impurity([0])
@@ -129,6 +135,8 @@ def test_split(self):
         random_state = 0
         rng.seed(random_state)
 
+        workers = -1
+
         X = rng.rand(100, 100)
 
         density = 0.5
@@ -139,7 +147,7 @@ def test_split(self):
             for j in range(10):
                 y[10 * i + j] = i
 
-        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state)
+        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers)
 
         split_info = splitter.split(np.array([i for i in range(100)]))
 

diff --git a/proglearn/transformers.py b/proglearn/transformers.py
@@ -2,20 +2,14 @@
 Main Author: Will LeVine 
 Corresponding Email: [email protected]
 """
+import keras
 import numpy as np
-
-from sklearn.tree import DecisionTreeClassifier
+from itertools import product
+from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator
 from sklearn.random_projection import SparseRandomProjection
-
-
-from sklearn.utils.validation import (
-    check_X_y,
-    check_array,
-    check_is_fitted,
-)
-
-import keras as keras
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
 
 from .base import BaseTransformer
 
@@ -347,6 +341,9 @@ class ObliqueSplitter:
         Ratio of non-zero component in the random projection matrix in the range '(0, 1]'.
     random_state : int
         Controls the pseudo random number generator used to generate the projection matrix.
+    workers : int
+        The number of cores used to parallelize the calculation of Gini impurity
+        (passed to joblib `Parallel` as `n_jobs`). Supply -1 to use all available cores.
 
     Methods
     -------
@@ -357,13 +354,15 @@ class ObliqueSplitter:
         node.
     score(y_sort, t)
         Finds the Gini impurity for a split.
+    _score(proj_X, y_sample, i, j)
+        Handles array indexing before calculating Gini impurity.
     impurity(idx)
         Finds the impurity for a certain set of samples.
     split(sample_inds)
         Determines the best possible split for the given set of samples.
     """
 
-    def __init__(self, X, y, proj_dims, density, random_state):
+    def __init__(self, X, y, proj_dims, density, random_state, workers):
 
         self.X = X
         self.y = y
@@ -377,6 +376,7 @@ def __init__(self, X, y, proj_dims, density, random_state):
         self.proj_dims = proj_dims
         self.density = density
         self.random_state = random_state
+        self.workers = workers
 
     def sample_proj_mat(self, sample_inds):
         """
@@ -473,6 +473,38 @@ def score(self, y_sort, t):
         ) * right_gini
         return gini
 
+    def _score(self, proj_X, y_sample, i, j):
+        """
+        Handles array indexing before calculating Gini impurity
+
+        Parameters
+        ----------
+        proj_X : {ndarray, sparse matrix} of shape (n_samples, self.proj_dims)
+            Projected input data matrix.
+        y_sample : array of shape [n_samples]
+            Labels for sample of data.
+        i : int
+            The index at which to split the sorted labels; passed to `score` as its threshold.
+        j : int
+            The index of the projection dimension (column of proj_X) to sort by.
+
+        Returns
+        -------
+        gini : float
+            The Gini impurity of the split.
+        i : int
+            The split index that was passed in, returned unchanged alongside the score.
+        j : int
+            The projection dimension that was passed in, returned unchanged alongside the score.
+        """
+        # Sort labels by the jth feature
+        idx = np.argsort(proj_X[:, j])
+        y_sort = y_sample[idx]
+
+        gini = self.score(y_sort, i)
+
+        return gini, i, j
+
     # Returns impurity for a group of examples
     # expects idx not None
     def impurity(self, idx):
@@ -503,12 +535,9 @@ def impurity(self, idx):
         return 1 - gini
 
     # Finds the best split
-    # This needs to be parallelized; its a major bottleneck
     def split(self, sample_inds):
         """
         Finds the optimal split for a set of samples.
-        Note that the code for this method needs to be parallelized. This is a major
-        bottleneck in integration with scikit-learn.
 
         Parameters
         ----------
@@ -533,17 +562,13 @@ def split(self, sample_inds):
         Q[0, :] = node_impurity
         Q[-1, :] = node_impurity
 
-        # Loop through projected features and examples to find best split
-        # This can be parallelized for sure
-        for j in range(self.proj_dims):
-
-            # Sort labels by the jth feature
-            idx = np.argsort(proj_X[:, j])
-            y_sort = y_sample[idx]
-
-            Q[1:-1, j] = np.array(
-                [self.score(y_sort, i) for i in range(1, n_samples - 1)]
-            )
+        # Loop through examples and projected features to calculate split scores
+        split_iterator = product(range(1, n_samples - 1), range(self.proj_dims))
+        scores = Parallel(n_jobs=self.workers)(
+            delayed(self._score)(proj_X, y_sample, i, j) for i, j in split_iterator
+        )
+        for gini, i, j in scores:
+            Q[i, j] = gini
 
         # Identify best split feature, minimum gini impurity
         best_split_ind = np.argmin(Q)
@@ -916,7 +941,7 @@ def predict(self, X):
         predictions = np.zeros(X.shape[0])
         for i in range(X.shape[0]):
             cur = self.nodes[0]
-            while not cur is None and not cur.is_leaf:
+            while cur is not None and not cur.is_leaf:
                 proj_X = cur.proj_mat.transform(X)
                 if proj_X[i, cur.feature] < cur.threshold:
                     id = cur.left_child
@@ -957,6 +982,9 @@ class ObliqueTreeClassifier(BaseEstimator):
         The feature combinations to use for the oblique split.
     density : float
         Density estimate.
+    workers : int, optional (default: -1)
+        The number of cores used to parallelize the calculation of Gini impurity.
+        Supply -1 to use all cores available to the process.
 
     Methods
     -------
@@ -990,7 +1018,8 @@ def __init__(
         # ccp_alpha=0.0,
         # New args
         feature_combinations=1.5,
-        density=0.5
+        density=0.5,
+        workers=-1,
     ):
 
         # self.criterion=criterion
@@ -1008,6 +1037,7 @@ def __init__(
 
         self.feature_combinations = feature_combinations
         self.density = density
+        self.workers = workers
 
     def fit(self, X, y):
         """
@@ -1028,7 +1058,7 @@ def fit(self, X, y):
 
         self.proj_dims = int(np.ceil(X.shape[1]) / self.feature_combinations)
         splitter = ObliqueSplitter(
-            X, y, self.proj_dims, self.density, self.random_state
+            X, y, self.proj_dims, self.density, self.random_state, self.workers
         )
 
         self.tree = ObliqueTree(