Added method and test to construct a series from the distance metrics (#3)

* Added method and test
* Switched back import statement
Rambatino · Apr 5, 2017 · 2f8a1c6 · 2f8a1c6
1 parent f7bca74
commit 2f8a1c6
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 28 deletions.
diff --git a/Kruskals/__main__.py b/Kruskals/__main__.py
@@ -26,7 +26,7 @@ def main():
         print('Could not detect file type. Please select one from "csv" or "sav"')
 
     print(Kruskals.from_pandas_df(data, nspace.independent_variables,
-                                nspace.dependent_variable[0]).percentage())
+                                nspace.dependent_variable[0]).driver_score_to_series())
 
 if __name__ == "__main__":
     main()
diff --git a/Kruskals/kruskals.py b/Kruskals/kruskals.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 from scipy.special import factorial
 from itertools import combinations, chain
 
@@ -14,13 +15,17 @@ class Kruskals(object):
         arr : numpy.ndarray (dtype: float/int)
             1-dimensional array of the dependent variable associated with ndarr
     """
-    def __init__(self, ndarr, arr):
+    def __init__(self, ndarr, arr, i_vars=None):
         self._ndarr = ndarr
         self._arr = arr
-        self._distance = None
+        self._driver_score = None
+        self._i_vars = i_vars
+
+        if i_vars and len(i_vars) != ndarr.shape[1]:
+            self._i_vars = None
 
     @staticmethod
-    def from_pandas_df(df, i_variables, d_variable):
+    def from_pandas_df(df, i_vars, d_var):
         """
         Helper method to pre-process a pandas data frame in order to run Kruskal's algorithm
         analysis
@@ -30,28 +35,37 @@ def from_pandas_df(df, i_variables, d_variable):
         df : pandas.DataFrame
             the dataframe with the dependent and independent variables in which
             to slice from
-        i_variables : array-like
+        i_vars : array-like
             list of the column names for the independent variables
-        d_variable : string
+        d_var : string
             the name of the dependent variable in the dataframe
         """
-        ind_df = df[i_variables]
+        ind_df = df[i_vars]
         ind_values = ind_df.values
-        dep_values = df[d_variable].values
-        return Kruskals(ind_values, dep_values)
+        dep_values = df[d_var].values
+        return Kruskals(ind_values, dep_values, i_vars)
+
+    def driver_score_to_series(self):
+        """
+        Returns the driver score for each variable in the independent set
+        as a pandas series
+        """
+        series = pd.Series(self.driver_score(), index=self._i_vars)
+        series.name = 'score'
+        series.index.name = 'driver'
+        return series
 
-    def distance(self):
+    def driver_score(self):
         """
-        Calculate the average distance between a point on the
-        n-dimensional plane and the other points
+        Calculate the driver score for all independent variables
         """
-        if self._distance is None:
+        if self._driver_score is None:
             ind_c, pij, pijm = self.generate_diff(self._ndarr, self._arr)
             pij_row_mean = np.nanmean(pij, axis=1) * (ind_c - 1)
             fact = factorial(ind_c - 1) / (2 * factorial(ind_c - 3))
             pijm_row_mean = np.nanmean(pijm, axis=(0, 2)) * fact
-            self._distance = (pij_row_mean + pijm_row_mean) / ((ind_c - 1) + fact)
-        return self._distance
+            self._driver_score = (pij_row_mean + pijm_row_mean) / ((ind_c - 1) + fact)
+        return self._driver_score
 
     def percentage(self):
         """
@@ -61,7 +75,8 @@ def percentage(self):
 
     def generate_diff(self, ndarr, arr):
         """
-        Internal method to calculate the difference between all points
+        Internal method to calculate the partial correlation squared between
+        the independent and the dependent variables
         """
         l = ndarr.shape[1]
         pij = np.empty((l,l,)) * np.nan
@@ -79,3 +94,9 @@ def pcor_squared(ndarr):
         """
         icvx = np.linalg.inv(np.cov(ndarr))
         return (icvx[0, 1] * icvx[0, 1]) / (icvx[0, 0] * icvx[1, 1])
+
+    def percentage(self):
+        """
+        Internal method to calculate relative effect on the dependent variable
+        """
+        return self.driver_score() / self.driver_score().sum() * 100
diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ from Kruskals import Kruskals
 pandas_data_frame = ...
 independent_variable_columns = ['a', 'b', 'c']
 dep_variable = 'd'
-Kruskals.from_pandas_df(df, independent_variable_columns, dep_variable).distance()
+Kruskals.from_pandas_df(df, independent_variable_columns, dep_variable).driver_score()
 ```
 
 Running from the Command Line

diff --git a/tests/test_kruskals.py b/tests/test_kruskals.py
@@ -6,8 +6,8 @@
 import numpy as np
 import pandas as pd
 
-def test_distance():
-    """ Test distance is calculated correctly """
+def test_driver_score():
+    """ Test driver_score is calculated correctly """
     ndarr = np.array([
       [1, 2, 3, 4, 5, 6],
       [6, 5, 4, 3, 8, 1],
@@ -19,10 +19,10 @@ def test_distance():
 
     arr = np.array([1, 2, 3, 4, 5, 6])
 
-    exp_distance = np.array([ 0.14721,  0.44398,  0.23979,  0.62493,  0.71898,  0.31662])
-    distance = np.round(Kruskals.Kruskals(ndarr, arr).distance(), decimals=5)
+    exp_driver_score = np.array([ 0.14721,  0.44398,  0.23979,  0.62493,  0.71898,  0.31662])
+    driver_score = np.round(Kruskals.Kruskals(ndarr, arr).driver_score(), decimals=5)
 
-    assert np.array_equal(distance, exp_distance)
+    assert np.array_equal(driver_score, exp_driver_score)
 
 def test_from_pandas_df():
     """ Test from pandas_df correctly slices the data """
@@ -35,12 +35,12 @@ def test_from_pandas_df():
       [1, 2, 2, 9, 1, 4, 6]
     ])
 
-    exp_distance = np.array([ 0.14721,  0.44398,  0.23979,  0.62493,  0.71898,  0.31662])
+    exp_driver_score = np.array([ 0.14721,  0.44398,  0.23979,  0.62493,  0.71898,  0.31662])
 
     df = pd.DataFrame(ndarr)
-    distance = np.round(Kruskals.Kruskals.from_pandas_df(df, list(range(6)), 6).distance(), decimals=5)
+    driver_score = np.round(Kruskals.Kruskals.from_pandas_df(df, list(range(6)), 6).driver_score(), decimals=5)
 
-    assert np.array_equal(distance, exp_distance)
+    assert np.array_equal(driver_score, exp_driver_score)
 
 def test_percentage():
     """ Test percentage is calculated correctly """
@@ -55,7 +55,27 @@ def test_percentage():
 
     arr = np.array([1, 2, 3, 4, 5, 6])
 
-    exp_distance = np.array([  5.90856,  17.81959,   9.62429,  25.08222,  28.85722,  12.70813])
-    distance = np.round(Kruskals.Kruskals(ndarr, arr).percentage(), decimals=5)
+    exp_driver_score = np.array([  5.90856,  17.81959,   9.62429,  25.08222,  28.85722,  12.70813])
+    driver_score = np.round(Kruskals.Kruskals(ndarr, arr).percentage(), decimals=5)
 
-    assert np.array_equal(distance, exp_distance)
+    assert np.array_equal(driver_score, exp_driver_score)
+
+def test_series_output():
+    """ Test driver scores are returned as a correctly labelled pandas series """
+    ndarr = np.array([
+      [1, 2, 3, 4, 5, 6],
+      [6, 5, 4, 3, 8, 1],
+      [1, 1, 9, 1, 1, 1],
+      [9, 2, 2, 2, 2, 2],
+      [3, 3, 3, 9, 3, 3],
+      [1, 2, 2, 9, 1, 4]
+    ])
+
+    arr = np.array([1, 2, 3, 4, 5, 6])
+
+    exp_driver_score = np.array([ 0.14721,  0.44398,  0.23979,  0.62493,  0.71898,  0.31662])
+    series = Kruskals.Kruskals(ndarr, arr).driver_score_to_series()
+
+    assert np.array_equal(np.round(series.values, decimals=5), exp_driver_score)
+    assert series.name == 'score'
+    assert series.index.name == 'driver'