# SVR script
# Created by Benjamin L Deck
# Last updated: 1.12.2020
# Support Vector Machine: Regression
# We are still dealing with a support-vector problem. In a typical SVM, the only data
# points that matter for fitting the model are the support vectors; we do not care about
# data lying beyond the margin. Similarly, SVR is only concerned with a subset of the
# training data.
# There are three types of support vector regression: SVR, NuSVR and LinearSVR.
# LinearSVR is a faster implementation, but it assumes the relationship is linear and
# therefore only uses a linear kernel. NuSVR uses the parameter nu to control the number
# of support vectors; nu replaces the epsilon of regular SVR/SVM and is an upper bound on
# the fraction of training errors and a lower bound on the fraction of support vectors,
# e.g. with nu = 0.5, at most half of the training samples can be errors and at least
# half will be support vectors.
# As in all other SVM models, the estimator accepts X, y inputs. However, in SVR the y
# (target) values are floats rather than class labels.
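# For reference, the three regressors mentioned above are constructed as follows
# (a minimal sketch; the hyper-parameter values are illustrative, not recommendations
# made by this script):
#     from sklearn.svm import SVR, NuSVR, LinearSVR
#     SVR(kernel='rbf', C=1.0, epsilon=0.1)    # epsilon-insensitive tube around the fit
#     NuSVR(kernel='rbf', C=1.0, nu=0.5)       # nu bounds the fraction of support vectors
#     LinearSVR(C=1.0, epsilon=0.1)            # linear kernel only, faster solver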
# This function expects two csv files with one row per participant, each column holding a
# mean connectivity value. Additionally, the matrix reduction should be performed before
# inputting features and targets into this function.
# import all of the necessary modules and utilities
def conn_svr(features, targets, type_kernel, cost, epsi):
""" SVR script Created by Benjamin L Deck
We are still concerned with classification issues. In typical SVM we know that the only data that matters are the support vectors in terms of fitting
the model, we don't care about the data lying beyond the epsilon margin. Three different types of support vector regression: SVR, NuSVR and linearSVR.
Linear SVR is a faster implementation but assumes the relationship between the support vectors is linear and thus only uses a linear kernel. NuSVR uses the parameter nu to control for the number of support vectors, nu in NuSVR replaces the epsilon in regular SVR/SVM-- nu is an upper bound on the fraction of training errors and lower bound on the fraction of support vectors, e.g. if interval set is 0,1 the split is 0.5 upper and 0.5 lower
Similar to most other machine learning paradigms in Sci-kit learn, we need to classify a target vector. The model will also accept X,y vectors. However, in SVR, the y or target are continuous.
This function anticipates an input of two csv files with a row per participant. Each column being mean connectivity. Additionally, The matrix reduction should be performed before inputing features and targets into this function.
features -- a csv of all features
targets -- a csv of all targets
typ_kernel -- a string denoting the type of kernel to apply as a hyper-parameter (i.e. 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed')
cost -- the cost parameter (30 is recommended by Zhang et al., 2014; DeMarco et al., 2018)
epsi -- the epsilon value the area surrounding the hyperplane in which there is no penalty to the r-squared value (float) (recommended is 0.1)
"""
    import seaborn as sns
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn import preprocessing
    from sklearn.svm import SVR
    from sklearn.model_selection import permutation_test_score
    from sklearn.utils import shuffle
    from sklearn.metrics import accuracy_score
    import timeit
    import time
    # Turn all features/targets into float64, print all feature/target types
    #df_features = features.drop(features.index[[0]])
    df_features = features.astype(float)
    print(df_features.dtypes)
    df_targets = targets.astype(float)
    # convert the features pandas dataframe to a numpy array
    X = df_features.to_numpy()
    print(X)  # print the features to examine
    # convert the target vector to a numpy array
    y = df_targets.to_numpy()
    print(y[:20])  # print the first 20 entries of the array
    ## Create the training and test sets (this step is optional); it is not necessary if conducting permutation testing
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)
    #print(X_train.shape)
    #print(X_test.shape)
    #print(y_train.shape)
    #print(y_test.shape)
    ## Feature Scaling
    # Here we standardize the scale of each feature set. We do this because the scale or
    # measurement units of the features are not uniform, so we may end up combining data
    # points that should not necessarily be combined: one variable may vary much more
    # than another (for instance, in an admissions dataset, GRE score has a much higher
    # range and std than research experience or letters of recommendation (LOR)). To
    # account for this we conduct feature scaling, which standardizes all measured
    # features and centers them around 0. This decreases the chance that one highly
    # variable feature inadvertently accounts for a disproportionate amount of the
    # variance in the model. For an example see
    # https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html
    # If you are interested in why the scaler does not get the scaled mean of your
    # features to exactly '0', see
    # https://stackoverflow.com/questions/40405803/mean-of-data-scaled-with-sklearn-standardscaler-is-not-zero
    #
    X_scaled = preprocessing.scale(X)
    print(X_scaled)
    print(X_scaled.mean())  # most likely not exactly zero: there is a limit to how close floating-point arithmetic can get us to 'absolute zero'
    print(X_scaled.std())
    # note: X_scaled is already standardized above, so refitting a StandardScaler here is
    # effectively a no-op and the transform result is not stored
    scaler = preprocessing.StandardScaler().fit(X_scaled)
    print(scaler)
    print(scaler.mean_)
    scaler.transform(X_scaled)
    y_scaled = preprocessing.scale(y)
    print(y_scaled)
    print(y_scaled.mean())  # most likely not exactly zero, for the same reason as above
    print(y_scaled.std())
    # If not using permutation testing, do this next step, where feature scaling is applied to both the train and test data
    # scaler = preprocessing.StandardScaler().fit(X_train)
    # print(scaler)
    # print(scaler.mean_)
    # scaler.transform(X_train)
    # scaler.transform(X_test)
    ## Creating the hypothesis-driven SVR model
    # This creates an SVR model with the specified kernel. C = 30 is given by Zhang et al., 2014
    # and DeMarco & Turkeltaub, 2018. Epsilon is the margin around the hyperplane within which
    # the model is not penalized for errors. C is the cost, i.e. the penalty for errors beyond
    # the epsilon margin; a higher cost encourages more support vectors, producing a more
    # complex fit to the data.
    svr = SVR(kernel=type_kernel, epsilon=epsi, C=cost)
    svr.fit(X_scaled, y_scaled.ravel())
    #svr.predict(X_test)  # use this if splitting into training and testing sets
    pred_svr = svr.predict(X_scaled)
    # Obtain the r-squared value for the overall model.
    hyp_score = svr.score(X_scaled, y_scaled)
    # coef_ only exists for a linear kernel; for other kernels store None instead
    hyp_coef = svr.coef_ if type_kernel == 'linear' else None
    hyp_params = svr.get_params(deep=True)
    ###### Permutation testing
    # Here we need to shuffle (permute) the target vector (behavior/clinical scale)
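    # A hedged aside: scikit-learn also ships a built-in helper for this,
    # sklearn.model_selection.permutation_test_score (already imported above), which refits
    # the estimator on permuted targets and returns the observed score, the null scores and
    # a p-value, e.g.:
    #     score, perm_scores, pvalue = permutation_test_score(
    #         svr, X_scaled, y_scaled.ravel(), n_permutations=10000)
    # Note that it cross-validates by default, unlike the in-sample fits used in the explicit
    # loop below, which is kept so the null distribution and beta weights stay available for
    # plotting.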
    perm_svr = SVR(kernel=type_kernel, epsilon=epsi, C=cost)
    beta_arr = np.array([])  # beta weights as they are returned by fitting each permuted model
    count = 0  # counts the number of permuted models that have been fit
    r_squared_arr = np.array([])
    # Time the permutation loop: 10,000 fits of the model on randomly permuted targets
    start = time.time()
    for ii in range(10000):
        y_scaled = shuffle(y_scaled)  # randomly permute the y-target vector
        perm_svr.fit(X_scaled, y_scaled.ravel())  # fit the permuted target vector to the feature set (connections between and within each region)
        if type_kernel == 'linear':  # coef_ only exists for a linear kernel
            beta_arr = np.append(beta_arr, perm_svr.coef_)  # append the beta weights of each feature (i.e. connections between and within each region)
        r_squared_arr = np.append(r_squared_arr, perm_svr.score(X_scaled, y_scaled))  # append the r^2 value of each model; together these form the null distribution
        count = count + 1  # iteration counter, used later to compute the total number of models
    end = time.time()
    print(end - start)
    plt.hist(r_squared_arr, bins=500)
    plt.axvline(np.median(r_squared_arr), color='k', linestyle='dashed', linewidth=2)
    plt.axvline(hyp_score, color='green', linestyle='solid', linewidth=2)
    plt.show()
    print(hyp_score)
    # Calculate the p-value for the comparison between the null distribution and the
    # hypothesized model (the original, unpermuted model that we ran).
    num_mods = count + 1  # total number of models run, including the hypothesized model
    num_obs_btr_than_hyp = 0
    for ii in r_squared_arr:  # count the r^2 values that are greater than the hypothesized model's
        if ii > hyp_score:
            num_obs_btr_than_hyp = num_obs_btr_than_hyp + 1
    print(num_obs_btr_than_hyp)
    num_obs_btr_than_hyp = num_obs_btr_than_hyp + 1  # add the original model to the count to calculate the p-value
    p_value = num_obs_btr_than_hyp / num_mods  # the proportion of models (including the hypothesized one) scoring at least as well as the hypothesized model
    print('Hypothesized model:', str(hyp_score))
    print('p-value for model:', str(p_value))
    print('number of models better than hypothesized model:', str(num_obs_btr_than_hyp))
    return hyp_params, hyp_score, hyp_coef
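

# A minimal usage sketch (an assumption, not part of the original analysis): the csv
# file names below are hypothetical placeholders; point them at your own feature and
# target tables and choose the kernel and hyper-parameters appropriate for your data.
if __name__ == "__main__":
    import pandas as pd

    features = pd.read_csv("mean_connectivity_features.csv")  # hypothetical file name
    targets = pd.read_csv("behavioral_targets.csv")           # hypothetical file name
    hyp_params, hyp_score, hyp_coef = conn_svr(
        features, targets, type_kernel="linear", cost=30, epsi=0.1)
    print(hyp_params, hyp_score)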