Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ability to pass driver names to Kruskals constructor #3

Merged
merged 2 commits into from
Apr 5, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Kruskals/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def main():
print('Could not detect file type. Please select one from "csv" or "sav"')

print(Kruskals.from_pandas_df(data, nspace.independent_variables,
nspace.dependent_variable[0]).percentage())
nspace.dependent_variable[0]).driver_score_to_series())

if __name__ == "__main__":
main()
51 changes: 36 additions & 15 deletions Kruskals/kruskals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
from scipy.special import factorial
from itertools import combinations, chain

Expand All @@ -14,13 +15,17 @@ class Kruskals(object):
arr : numpy.ndarray (dtype: float/int)
1-dimensional array of the dependent variable associated with ndarr
"""
def __init__(self, ndarr, arr):
def __init__(self, ndarr, arr, i_vars=None):
# i_vars: optional names for the columns of ndarr (defaults to None)
self._ndarr = ndarr
self._arr = arr
self._distance = None
# Initialised to None; driver_score() stores its result here on first use
self._driver_score = None
self._i_vars = i_vars

# Discard the names when their count does not match the number of columns in ndarr
# NOTE(review): `if i_vars` raises ValueError when i_vars is a multi-element
# numpy array or pandas Index; `i_vars is not None` would avoid that - confirm
# which input types callers pass
if i_vars and len(i_vars) != ndarr.shape[1]:
self._i_vars = None

@staticmethod
def from_pandas_df(df, i_variables, d_variable):
def from_pandas_df(df, i_vars, d_var):
"""
Helper method to pre-process a pandas data frame in order to run Kruskal's algorithm
analysis
Expand All @@ -30,28 +35,37 @@ def from_pandas_df(df, i_variables, d_variable):
df : pandas.DataFrame
the dataframe with the dependent and independent variables in which
to slice from
i_variables : array-like
i_vars : array-like
list of the column names for the independent variables
d_variable : string
d_var : string
the name of the dependent variable in the dataframe
"""
ind_df = df[i_variables]
ind_df = df[i_vars]
ind_values = ind_df.values
dep_values = df[d_variable].values
return Kruskals(ind_values, dep_values)
dep_values = df[d_var].values
return Kruskals(ind_values, dep_values, i_vars)

def driver_score_to_series(self):
    """
    Returns the driver score for each variable in the independent set
    as a pandas series

    The series is named 'score' and its index is named 'driver'. The
    index holds the names stored as i_vars, or the default integer
    index when no names are stored.
    """
    series = pd.Series(self.driver_score(), index=self._i_vars, name='score')
    # rename_axis returns a new object with its own index, so a pandas Index
    # supplied as i_vars is not renamed as a side effect of calling this method
    return series.rename_axis('driver')

def distance(self):
def driver_score(self):
"""
Calculate the average distance between a point on the
n-dimensional plane and the other points
Calculate the driver score for all independent variables
"""
# Computed once; later calls return the value stored on the instance
if self._distance is None:
if self._driver_score is None:
ind_c, pij, pijm = self.generate_diff(self._ndarr, self._arr)
pij_row_mean = np.nanmean(pij, axis=1) * (ind_c - 1)
# NOTE(review): scipy's factorial returns 0 for negative input, so this divides
# by zero when ind_c < 3 - presumably ind_c is the number of independent
# variables; confirm the minimum supported
fact = factorial(ind_c - 1) / (2 * factorial(ind_c - 3))
pijm_row_mean = np.nanmean(pijm, axis=(0, 2)) * fact
# Weighted average of the two means, with weights (ind_c - 1) and fact
self._distance = (pij_row_mean + pijm_row_mean) / ((ind_c - 1) + fact)
return self._distance
self._driver_score = (pij_row_mean + pijm_row_mean) / ((ind_c - 1) + fact)
return self._driver_score

def percentage(self):
"""
Expand All @@ -61,7 +75,8 @@ def percentage(self):

def generate_diff(self, ndarr, arr):
"""
Internal method to calculate the difference between all points
Internal method to calculate the partial correlation squared between
the independent and the dependent variables
"""
l = ndarr.shape[1]
pij = np.empty((l,l,)) * np.nan
Expand All @@ -79,3 +94,9 @@ def pcor_squared(ndarr):
"""
icvx = np.linalg.inv(np.cov(ndarr))
return (icvx[0, 1] * icvx[0, 1]) / (icvx[0, 0] * icvx[1, 1])

def percentage(self):
"""
Calculate each driver score as a percentage of the sum of all driver scores
"""
return self.driver_score() / self.driver_score().sum() * 100
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ from Kruskals import Kruskals
pandas_data_frame = ...
independent_variable_columns = ['a', 'b', 'c']
dep_variable = 'd'
Kruskals.from_pandas_df(df, independent_variable_columns, dep_variable).distance()
Kruskals.from_pandas_df(df, independent_variable_columns, dep_variable).driver_score()
```

Running from the Command Line
Expand Down
42 changes: 31 additions & 11 deletions tests/test_kruskals.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import numpy as np
import pandas as pd

def test_distance():
""" Test distance is calculated correctly """
def test_driver_score():
""" Test driver_score is calculated correctly """
ndarr = np.array([
[1, 2, 3, 4, 5, 6],
[6, 5, 4, 3, 8, 1],
Expand All @@ -19,10 +19,10 @@ def test_distance():

arr = np.array([1, 2, 3, 4, 5, 6])

exp_distance = np.array([ 0.14721, 0.44398, 0.23979, 0.62493, 0.71898, 0.31662])
distance = np.round(Kruskals.Kruskals(ndarr, arr).distance(), decimals=5)
exp_driver_score = np.array([ 0.14721, 0.44398, 0.23979, 0.62493, 0.71898, 0.31662])
driver_score = np.round(Kruskals.Kruskals(ndarr, arr).driver_score(), decimals=5)

assert np.array_equal(distance, exp_distance)
assert np.array_equal(driver_score, exp_driver_score)

def test_from_pandas_df():
""" Test from pandas_df correctly slices the data """
Expand All @@ -35,12 +35,12 @@ def test_from_pandas_df():
[1, 2, 2, 9, 1, 4, 6]
])

exp_distance = np.array([ 0.14721, 0.44398, 0.23979, 0.62493, 0.71898, 0.31662])
exp_driver_score = np.array([ 0.14721, 0.44398, 0.23979, 0.62493, 0.71898, 0.31662])

df = pd.DataFrame(ndarr)
distance = np.round(Kruskals.Kruskals.from_pandas_df(df, list(range(6)), 6).distance(), decimals=5)
driver_score = np.round(Kruskals.Kruskals.from_pandas_df(df, list(range(6)), 6).driver_score(), decimals=5)

assert np.array_equal(distance, exp_distance)
assert np.array_equal(driver_score, exp_driver_score)

def test_percentage():
""" Test percentage is calculated correctly """
Expand All @@ -55,7 +55,27 @@ def test_percentage():

arr = np.array([1, 2, 3, 4, 5, 6])

exp_distance = np.array([ 5.90856, 17.81959, 9.62429, 25.08222, 28.85722, 12.70813])
distance = np.round(Kruskals.Kruskals(ndarr, arr).percentage(), decimals=5)
exp_driver_score = np.array([ 5.90856, 17.81959, 9.62429, 25.08222, 28.85722, 12.70813])
driver_score = np.round(Kruskals.Kruskals(ndarr, arr).percentage(), decimals=5)

assert np.array_equal(distance, exp_distance)
assert np.array_equal(driver_score, exp_driver_score)

def test_series_output():
""" Test driver_score_to_series returns the driver scores as a named pandas series """
ndarr = np.array([
[1, 2, 3, 4, 5, 6],
[6, 5, 4, 3, 8, 1],
[1, 1, 9, 1, 1, 1],
[9, 2, 2, 2, 2, 2],
[3, 3, 3, 9, 3, 3],
[1, 2, 2, 9, 1, 4]
])

arr = np.array([1, 2, 3, 4, 5, 6])

exp_driver_score = np.array([ 0.14721, 0.44398, 0.23979, 0.62493, 0.71898, 0.31662])
# No i_vars are passed here, so only the values and the series/index names are checked
series = Kruskals.Kruskals(ndarr, arr).driver_score_to_series()

assert np.array_equal(np.round(series.values, decimals=5), exp_driver_score)
assert series.name == 'score'
assert series.index.name == 'driver'