-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmulticollinearity.py
71 lines (51 loc) · 2.29 KB
/
multicollinearity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from eda import groups, eda_prep
from ml_package import *
def main():
    """Run the multicollinearity diagnostics on the EDA-prepared data.

    Steps, in order:

    1. Load ``X, y`` with ``eda_prep()`` and drop every column that
       ``data_prep.find_inapp`` lists under "Constant", "Unique", "Null"
       or "Low Variance".
    2. Mean-impute the non-object columns, mode-impute the object columns,
       target-encode the object columns and standardise the non-object ones.
    3. Pass the frame through ``custom_comps.PCATransformer`` and append a
       constant "Intercept" column.
    4. Print the variance inflation factor (VIF) of every column, sorted
       from highest to lowest, followed by the columns above the threshold.
    5. Fit a logistic regression of ``y`` on the features and print its
       summary.
    6. Display the correlation heatmap of the features.
    """
    # Heavy third-party imports stay local so that importing this module
    # (rather than running it) does not pull them in.
    import category_encoders as ce
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns
    import statsmodels.api as sm
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # Columns whose VIF exceeds this value are reported separately.
    vif_threshold = 5

    # Load the data with the insights and fixes learned from the EDA process.
    X, y = eda_prep()

    # Drop inappropriate features: constant columns, categoric columns with
    # too many unique values, columns with too many nulls, and columns with
    # too little variance. Labels are de-duplicated (order preserved) so a
    # column flagged for several reasons is only dropped once.
    col_dict = data_prep.find_inapp(X)
    flagged = []
    for reason in ("Constant", "Unique", "Null", "Low Variance"):
        flagged.extend(col_dict[reason])
    X = X.drop(columns=list(dict.fromkeys(flagged)))

    # Split the remaining columns by dtype.
    num_columns = X.select_dtypes(exclude=["object"]).columns
    obj_columns = X.select_dtypes(include=["object"]).columns

    # Fill missing values: column mean for numeric, most frequent value for
    # categoric columns.
    X.loc[:, num_columns] = SimpleImputer(strategy="mean").fit_transform(
        X[num_columns]
    )
    X.loc[:, obj_columns] = SimpleImputer(
        strategy="most_frequent"
    ).fit_transform(X[obj_columns])

    # Replace each categoric column by its target encoding.
    encoder = ce.TargetEncoder(cols=obj_columns, return_df=True)
    X = encoder.fit_transform(X, y)

    # Standardise the originally numeric columns only.
    X[num_columns] = StandardScaler().fit_transform(X[num_columns])

    # NOTE(review): PCATransformer is project code; only the first element
    # of its result is kept here -- confirm what the remaining elements are.
    X = custom_comps.PCATransformer(X, X, groups)[0]

    # Neither variance_inflation_factor nor sm.Logit adds a constant on its
    # own, so the intercept column is appended explicitly.
    X["Intercept"] = 1

    # Variance inflation factor per column. The design matrix is extracted
    # once instead of once per column.
    design = X.values
    vif_info = pd.DataFrame(
        {
            "VIF": [
                variance_inflation_factor(design, idx)
                for idx in range(design.shape[1])
            ],
            "Column": X.columns,
        }
    ).sort_values("VIF", ascending=False)

    print("Variance inflation factors:")
    print(vif_info.to_string(index=False))

    high_vif = vif_info[vif_info["VIF"] > vif_threshold]
    print(f"\nColumns with VIF > {vif_threshold}:")
    print(high_vif.to_string(index=False) if not high_vif.empty else "(none)")

    # Correlation heatmap. The constant column is left out because its
    # correlation with every other column is undefined (NaN).
    plt.figure(figsize=(18, 18), dpi=80)
    sns.heatmap(X.drop(columns=["Intercept"]).corr())

    # Logistic regression of the target on all features plus the intercept.
    results = sm.Logit(y, X).fit(method="newton", maxiter=1000)
    print(results.summary())

    # Shown last so the blocking plot window does not delay the text output.
    plt.show()


if __name__ == "__main__":
    main()