Skip to content

Commit

Permalink
Added method and test to construct a series from the distance metrics (
Browse files Browse the repository at this point in the history
…#3)

* Added method and test

* Switched back import statement
  • Loading branch information
Mark Ramotowski authored Apr 5, 2017
1 parent f7bca74 commit 2f8a1c6
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 28 deletions.
2 changes: 1 addition & 1 deletion Kruskals/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def main():
print('Could not detect file type. Please select one from "csv" or "sav"')

print(Kruskals.from_pandas_df(data, nspace.independent_variables,
nspace.dependent_variable[0]).percentage())
nspace.dependent_variable[0]).driver_score_to_series())

if __name__ == "__main__":
main()
51 changes: 36 additions & 15 deletions Kruskals/kruskals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
from scipy.special import factorial
from itertools import combinations, chain

Expand All @@ -14,13 +15,17 @@ class Kruskals(object):
arr : numpy.ndarray (dtype: float/int)
1-dimensional array of the dependent variable associated with ndarr
"""
def __init__(self, ndarr, arr):
def __init__(self, ndarr, arr, i_vars=None):
self._ndarr = ndarr
self._arr = arr
self._distance = None
self._driver_score = None
self._i_vars = i_vars

# Discard the labels when their count does not match the number of
# columns in ndarr, so they are never attached to the wrong variables.
# NOTE(review): `if i_vars` raises ValueError (ambiguous truth value) for a
# multi-element numpy array or a pandas Index; `i_vars is not None` would be
# safer if callers may pass anything other than a plain list - confirm.
if i_vars and len(i_vars) != ndarr.shape[1]:
self._i_vars = None

@staticmethod
def from_pandas_df(df, i_variables, d_variable):
def from_pandas_df(df, i_vars, d_var):
"""
Helper method to pre-process a pandas data frame in order to run Kruskal's algorithm
analysis
Expand All @@ -30,28 +35,37 @@ def from_pandas_df(df, i_variables, d_variable):
df : pandas.DataFrame
the dataframe with the dependent and independent variables in which
to slice from
i_variables : array-like
i_vars : array-like
list of the column names for the independent variables
d_variable : string
d_var : string
the name of the dependent variable in the dataframe
"""
ind_df = df[i_variables]
ind_df = df[i_vars]
ind_values = ind_df.values
dep_values = df[d_variable].values
return Kruskals(ind_values, dep_values)
dep_values = df[d_var].values
return Kruskals(ind_values, dep_values, i_vars)

def driver_score_to_series(self):
"""
Returns the driver score for each variable in the independent set
as a pandas series

The series is named 'score' and its index is named 'driver'. The index
labels are the i_vars kept at construction; when none were kept, None
is passed as the index to pandas.
"""
series = pd.Series(self.driver_score(), index=self._i_vars)
series.name = 'score'
series.index.name = 'driver'
return series

def distance(self):
def driver_score(self):
"""
Calculate the average distance between a point on the
n-dimensional plane and the other points
Calculate the driver score for all independent variables
"""
# The result is computed on the first call and cached on the instance.
if self._distance is None:
if self._driver_score is None:
ind_c, pij, pijm = self.generate_diff(self._ndarr, self._arr)
# Row means ignore NaN entries and are scaled by (ind_c - 1).
pij_row_mean = np.nanmean(pij, axis=1) * (ind_c - 1)
# fact simplifies to (ind_c - 1) * (ind_c - 2) / 2.
# NOTE(review): presumably ind_c is the number of independent variables;
# for fewer than three, factorial(ind_c - 3) is 0 and the divisor is zero - confirm.
fact = factorial(ind_c - 1) / (2 * factorial(ind_c - 3))
pijm_row_mean = np.nanmean(pijm, axis=(0, 2)) * fact
# Combine both scaled means and normalise by the sum of their weights.
self._distance = (pij_row_mean + pijm_row_mean) / ((ind_c - 1) + fact)
return self._distance
self._driver_score = (pij_row_mean + pijm_row_mean) / ((ind_c - 1) + fact)
return self._driver_score

def percentage(self):
"""
Expand All @@ -61,7 +75,8 @@ def percentage(self):

def generate_diff(self, ndarr, arr):
"""
Internal method to calculate the difference between all points
Internal method to calculate the partial correlation squared between
the independent and the dependent variables
"""
l = ndarr.shape[1]
pij = np.empty((l,l,)) * np.nan
Expand All @@ -79,3 +94,9 @@ def pcor_squared(ndarr):
"""
icvx = np.linalg.inv(np.cov(ndarr))
return (icvx[0, 1] * icvx[0, 1]) / (icvx[0, 0] * icvx[1, 1])

def percentage(self):
"""
Calculate each driver score as a percentage of the sum of all driver scores
"""
return self.driver_score() / self.driver_score().sum() * 100
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ from Kruskals import Kruskals
df = ...  # a pandas DataFrame
independent_variable_columns = ['a', 'b', 'c']
dep_variable = 'd'
Kruskals.from_pandas_df(df, independent_variable_columns, dep_variable).distance()
Kruskals.from_pandas_df(df, independent_variable_columns, dep_variable).driver_score()
```

Running from the Command Line
Expand Down
42 changes: 31 additions & 11 deletions tests/test_kruskals.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import numpy as np
import pandas as pd

def test_distance():
""" Test distance is calculated correctly """
def test_driver_score():
""" Test driver_score is calculated correctly """
ndarr = np.array([
[1, 2, 3, 4, 5, 6],
[6, 5, 4, 3, 8, 1],
Expand All @@ -19,10 +19,10 @@ def test_distance():

arr = np.array([1, 2, 3, 4, 5, 6])

exp_distance = np.array([ 0.14721, 0.44398, 0.23979, 0.62493, 0.71898, 0.31662])
distance = np.round(Kruskals.Kruskals(ndarr, arr).distance(), decimals=5)
exp_driver_score = np.array([ 0.14721, 0.44398, 0.23979, 0.62493, 0.71898, 0.31662])
driver_score = np.round(Kruskals.Kruskals(ndarr, arr).driver_score(), decimals=5)

assert np.array_equal(distance, exp_distance)
assert np.array_equal(driver_score, exp_driver_score)

def test_from_pandas_df():
""" Test from pandas_df correctly slices the data """
Expand All @@ -35,12 +35,12 @@ def test_from_pandas_df():
[1, 2, 2, 9, 1, 4, 6]
])

exp_distance = np.array([ 0.14721, 0.44398, 0.23979, 0.62493, 0.71898, 0.31662])
exp_driver_score = np.array([ 0.14721, 0.44398, 0.23979, 0.62493, 0.71898, 0.31662])

df = pd.DataFrame(ndarr)
distance = np.round(Kruskals.Kruskals.from_pandas_df(df, list(range(6)), 6).distance(), decimals=5)
driver_score = np.round(Kruskals.Kruskals.from_pandas_df(df, list(range(6)), 6).driver_score(), decimals=5)

assert np.array_equal(distance, exp_distance)
assert np.array_equal(driver_score, exp_driver_score)

def test_percentage():
""" Test percentage is calculated correctly """
Expand All @@ -55,7 +55,27 @@ def test_percentage():

arr = np.array([1, 2, 3, 4, 5, 6])

exp_distance = np.array([ 5.90856, 17.81959, 9.62429, 25.08222, 28.85722, 12.70813])
distance = np.round(Kruskals.Kruskals(ndarr, arr).percentage(), decimals=5)
exp_driver_score = np.array([ 5.90856, 17.81959, 9.62429, 25.08222, 28.85722, 12.70813])
driver_score = np.round(Kruskals.Kruskals(ndarr, arr).percentage(), decimals=5)

assert np.array_equal(distance, exp_distance)
assert np.array_equal(driver_score, exp_driver_score)

def test_series_output():
""" Test driver_score_to_series returns the driver scores as a named pandas series """
ndarr = np.array([
[1, 2, 3, 4, 5, 6],
[6, 5, 4, 3, 8, 1],
[1, 1, 9, 1, 1, 1],
[9, 2, 2, 2, 2, 2],
[3, 3, 3, 9, 3, 3],
[1, 2, 2, 9, 1, 4]
])

arr = np.array([1, 2, 3, 4, 5, 6])

exp_driver_score = np.array([ 0.14721, 0.44398, 0.23979, 0.62493, 0.71898, 0.31662])
series = Kruskals.Kruskals(ndarr, arr).driver_score_to_series()

assert np.array_equal(np.round(series.values, decimals=5), exp_driver_score)
assert series.name == 'score'
assert series.index.name == 'driver'

0 comments on commit 2f8a1c6

Please sign in to comment.