From 874230e0f39db5cc0252b266b369757106372daa Mon Sep 17 00:00:00 2001
From: v715 <vgopala4@jhu.edu>
Date: Tue, 12 Jan 2021 23:20:40 -0500
Subject: [PATCH 1/5] Add wrapper for calculating Gini impurity

---
 proglearn/transformers.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/proglearn/transformers.py b/proglearn/transformers.py
index 93546fc116..db4d6a8023 100644
--- a/proglearn/transformers.py
+++ b/proglearn/transformers.py
@@ -357,6 +357,8 @@ class ObliqueSplitter:
         node.
     score(y_sort, t)
         Finds the Gini impurity for a split.
+    _score(proj_X, y_sample, i, j)
+        Handles array indexing before calculating Gini impurity.
     impurity(idx)
         Finds the impurity for a certain set of samples.
     split(sample_inds)
@@ -473,6 +475,38 @@ def score(self, y_sort, t):
         ) * right_gini
         return gini
 
+    def _score(self, proj_X, y_sample, i, j):
+        """
+        Sorts labels by the jth projected feature, then calculates Gini impurity.
+
+        Parameters
+        ----------
+        proj_X : {ndarray, sparse matrix} of shape (n_samples, self.proj_dims)
+            Projected input data matrix.
+        y_sample : array of shape [n_samples]
+            Labels for sample of data.
+        i : int
+            The index at which to split the sorted labels.
+        j : int
+            The index of the projection dimension to sort by.
+
+        Returns
+        -------
+        gini : float
+            The Gini impurity of the split.
+        i : int
+            The split index, returned unchanged.
+        j : int
+            The projection dimension index, returned unchanged.
+        """
+        # Sort labels by the jth feature
+        idx = np.argsort(proj_X[:, j])
+        y_sort = y_sample[idx]
+
+        gini = self.score(y_sort, i)
+
+        return gini, i, j
+
     # Returns impurity for a group of examples
     # expects idx not None
     def impurity(self, idx):

From 95663670eff8020940bc9829f0678f4e5f202194 Mon Sep 17 00:00:00 2001
From: v715 <vgopala4@jhu.edu>
Date: Tue, 12 Jan 2021 23:21:05 -0500
Subject: [PATCH 2/5] Parallelize Gini impurity calculation

---
 proglearn/transformers.py | 55 ++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 29 deletions(-)

diff --git a/proglearn/transformers.py b/proglearn/transformers.py
index db4d6a8023..bacbf4397d 100644
--- a/proglearn/transformers.py
+++ b/proglearn/transformers.py
@@ -2,20 +2,15 @@
 Main Author: Will LeVine 
 Corresponding Email: levinewill@icloud.com
 """
-import numpy as np
+from itertools import product
 
-from sklearn.tree import DecisionTreeClassifier
+import keras as keras
+import numpy as np
+from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator
 from sklearn.random_projection import SparseRandomProjection
-
-
-from sklearn.utils.validation import (
-    check_X_y,
-    check_array,
-    check_is_fitted,
-)
-
-import keras as keras
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
 
 from .base import BaseTransformer
 
@@ -347,6 +342,9 @@ class ObliqueSplitter:
         Ratio of non-zero component in the random projection matrix in the range '(0, 1]'.
     random_state : int
         Controls the pseudo random number generator used to generate the projection matrix.
+    workers : int
+        The number of cores to parallelize the p-value computation over.
+        Supply -1 to use all cores available to the Process.
 
     Methods
     -------
@@ -365,7 +363,7 @@ class ObliqueSplitter:
         Determines the best possible split for the given set of samples.
     """
 
-    def __init__(self, X, y, proj_dims, density, random_state):
+    def __init__(self, X, y, proj_dims, density, random_state, workers):
 
         self.X = X
         self.y = y
@@ -379,6 +377,7 @@ def __init__(self, X, y, proj_dims, density, random_state):
         self.proj_dims = proj_dims
         self.density = density
         self.random_state = random_state
+        self.workers = workers
 
     def sample_proj_mat(self, sample_inds):
         """
@@ -537,12 +536,9 @@ def impurity(self, idx):
         return 1 - gini
 
     # Finds the best split
-    # This needs to be parallelized; its a major bottleneck
     def split(self, sample_inds):
         """
         Finds the optimal split for a set of samples.
-        Note that the code for this method needs to be parallelized. This is a major
-        bottleneck in integration with scikit-learn.
 
         Parameters
         ----------
@@ -567,17 +563,13 @@ def split(self, sample_inds):
         Q[0, :] = node_impurity
         Q[-1, :] = node_impurity
 
-        # Loop through projected features and examples to find best split
-        # This can be parallelized for sure
-        for j in range(self.proj_dims):
-
-            # Sort labels by the jth feature
-            idx = np.argsort(proj_X[:, j])
-            y_sort = y_sample[idx]
-
-            Q[1:-1, j] = np.array(
-                [self.score(y_sort, i) for i in range(1, n_samples - 1)]
-            )
+        # Loop through examples and projected features to calculate split scores
+        split_iterator = product(range(1, n_samples - 1), range(self.proj_dims))
+        scores = Parallel(n_jobs=-1)(
+            delayed(self._score)(proj_X, y_sample, i, j) for i, j in split_iterator
+        )
+        for gini, i, j in scores:
+            Q[i, j] = gini
 
         # Identify best split feature, minimum gini impurity
         best_split_ind = np.argmin(Q)
@@ -950,7 +942,7 @@ def predict(self, X):
         predictions = np.zeros(X.shape[0])
         for i in range(X.shape[0]):
             cur = self.nodes[0]
-            while not cur is None and not cur.is_leaf:
+            while cur is not None and not cur.is_leaf:
                 proj_X = cur.proj_mat.transform(X)
                 if proj_X[i, cur.feature] < cur.threshold:
                     id = cur.left_child
@@ -991,6 +983,9 @@ class ObliqueTreeClassifier(BaseEstimator):
         The feature combinations to use for the oblique split.
     density : float
         Density estimate.
+    workers : int, optional (default: -1)
+        The number of cores to parallelize the p-value computation over.
+        Supply -1 to use all cores available to the Process.
 
     Methods
     -------
@@ -1024,7 +1019,8 @@ def __init__(
         # ccp_alpha=0.0,
         # New args
         feature_combinations=1.5,
-        density=0.5
+        density=0.5,
+        workers=-1,
     ):
 
         # self.criterion=criterion
@@ -1042,6 +1038,7 @@ def __init__(
 
         self.feature_combinations = feature_combinations
         self.density = density
+        self.workers = workers
 
     def fit(self, X, y):
         """
@@ -1062,7 +1059,7 @@ def fit(self, X, y):
 
         self.proj_dims = int(np.ceil(X.shape[1]) / self.feature_combinations)
         splitter = ObliqueSplitter(
-            X, y, self.proj_dims, self.density, self.random_state
+            X, y, self.proj_dims, self.density, self.random_state, self.workers
         )
 
         self.tree = ObliqueTree(

From dc722d0ee2738829fb4107577e77e5cb3d1523a2 Mon Sep 17 00:00:00 2001
From: v715 <vgopala4@jhu.edu>
Date: Tue, 12 Jan 2021 23:21:16 -0500
Subject: [PATCH 3/5] Add workers argument to tests

---
 proglearn/tests/test_transformer.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/proglearn/tests/test_transformer.py b/proglearn/tests/test_transformer.py
index a4de499ef3..ce1f1bb782 100644
--- a/proglearn/tests/test_transformer.py
+++ b/proglearn/tests/test_transformer.py
@@ -46,6 +46,8 @@ def test_sample_projmat(self):
         random_state = 0
         rng.seed(random_state)
 
+        workers = -1
+
         X = rng.rand(100, 100)
         y = np.zeros(100)
 
@@ -62,7 +64,7 @@ def test_sample_projmat(self):
         n_sample_inds = [10, 20, 40, 60, 80]
 
         for pd in proj_dims:
-            splitter = ObliqueSplitter(X, y, pd, density, random_state)
+            splitter = ObliqueSplitter(X, y, pd, density, random_state, workers)
 
             for i in range(len(n_sample_inds)):
                 si = sample_inds[i]
@@ -77,13 +79,15 @@ def test_score(self):
         random_state = 0
         rng.seed(random_state)
 
+        workers = -1
+
         X = rng.rand(11, 11)
 
         density = 0.5
         proj_dims = 5
 
         y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
-        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state)
+        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers)
 
         score = splitter.score(y, 6)
         assert 0 == score
@@ -96,6 +100,8 @@ def test_impurity(self):
         random_state = 0
         rng.seed(random_state)
 
+        workers = -1
+
         X = rng.rand(100, 100)
 
         density = 0.5
@@ -106,7 +112,7 @@ def test_impurity(self):
             for j in range(10):
                 y[10 * i + j] = i
 
-        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state)
+        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers)
 
         # Impurity of one thing should be 0
         impurity = splitter.impurity([0])
@@ -129,6 +135,8 @@ def test_split(self):
         random_state = 0
         rng.seed(random_state)
 
+        workers = -1
+
         X = rng.rand(100, 100)
 
         density = 0.5
@@ -139,7 +147,7 @@ def test_split(self):
             for j in range(10):
                 y[10 * i + j] = i
 
-        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state)
+        splitter = ObliqueSplitter(X, y, proj_dims, density, random_state, workers)
 
         split_info = splitter.split(np.array([i for i in range(100)]))
 

From 2b6836fe130661d6fe11e3835ab0ff9a3879c5bb Mon Sep 17 00:00:00 2001
From: v715 <vgopala4@jhu.edu>
Date: Tue, 12 Jan 2021 23:40:02 -0500
Subject: [PATCH 4/5] Fix docstring for workers argument

---
 proglearn/transformers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/proglearn/transformers.py b/proglearn/transformers.py
index bacbf4397d..25baa6d3c1 100644
--- a/proglearn/transformers.py
+++ b/proglearn/transformers.py
@@ -343,7 +343,7 @@ class ObliqueSplitter:
     random_state : int
         Controls the pseudo random number generator used to generate the projection matrix.
     workers : int
-        The number of cores to parallelize the p-value computation over.
+        The number of cores to parallelize the calculation of Gini impurity.
         Supply -1 to use all cores available to the Process.
 
     Methods
@@ -984,7 +984,7 @@ class ObliqueTreeClassifier(BaseEstimator):
     density : float
         Density estimate.
     workers : int, optional (default: -1)
-        The number of cores to parallelize the p-value computation over.
+        The number of cores to parallelize the calculation of Gini impurity.
         Supply -1 to use all cores available to the Process.
 
     Methods

From b41e31e9bf3e57df2c37ec87d9ead1e0be806e29 Mon Sep 17 00:00:00 2001
From: Haoyin Xu <haoyinxu@gmail.com>
Date: Wed, 13 Jan 2021 09:13:16 -0500
Subject: [PATCH 5/5] Use specified worker numbers for parallelization

Co-Authored-By: v715 <vgopala4@jhu.edu>
---
 proglearn/transformers.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/proglearn/transformers.py b/proglearn/transformers.py
index 25baa6d3c1..b7f89a3960 100644
--- a/proglearn/transformers.py
+++ b/proglearn/transformers.py
@@ -2,10 +2,9 @@
 Main Author: Will LeVine 
 Corresponding Email: levinewill@icloud.com
 """
-from itertools import product
-
-import keras as keras
+import keras
 import numpy as np
+from itertools import product
 from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator
 from sklearn.random_projection import SparseRandomProjection
@@ -565,7 +564,7 @@ def split(self, sample_inds):
 
         # Loop through examples and projected features to calculate split scores
         split_iterator = product(range(1, n_samples - 1), range(self.proj_dims))
-        scores = Parallel(n_jobs=-1)(
+        scores = Parallel(n_jobs=self.workers)(
             delayed(self._score)(proj_X, y_sample, i, j) for i, j in split_iterator
         )
         for gini, i, j in scores: