
Commit

update analysis.py: add two more models and a confusion matrix
xiw315 committed Jan 31, 2020
1 parent 254f679 commit cc48cd2
Showing 3 changed files with 74 additions and 8 deletions.
3 changes: 3 additions & 0 deletions results/analysis_confusion.csv
@@ -0,0 +1,3 @@
,0,1
0,20,7
1,11,18
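These counts imply a validation accuracy of (20 + 18) / 56, about 0.679, for the tuned logistic regression, which matches the Validation_Error of 0.321 reported in analysis_result.csv below. A minimal sketch of deriving the usual summary metrics from this file, assuming class 1 is treated as the positive class:

import pandas as pd

# rows are true labels, columns are predictions (sklearn's confusion_matrix convention)
confusion = pd.read_csv("results/analysis_confusion.csv", index_col=0)
tn, fp = confusion.iloc[0, 0], confusion.iloc[0, 1]   # 20, 7
fn, tp = confusion.iloc[1, 0], confusion.iloc[1, 1]   # 11, 18

accuracy = (tp + tn) / (tp + tn + fp + fn)   # 38/56, about 0.679
precision = tp / (tp + fp)                   # 18/25 = 0.72
recall = tp / (tp + fn)                      # 18/29, about 0.621
print(accuracy, precision, recall)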
19 changes: 17 additions & 2 deletions results/analysis_result.csv
@@ -1,2 +1,17 @@
,Best_max_depth,Best_CV_Score,Training_Error,Validation_Error
0,7,0.65,0.14090909090909087,0.3571428571428571
method,Random Forest
Best_max_depth,[5]
Best_min_samples_split,[2]
Best_CV_Score,[0.6954545454545454]
Training_Error,[0.15909090909090906]
Validation_Error,[0.2678571428571429]
method,Decision Tree
Best_max_depth,[7]
Best_min_samples_split,[4]
Best_CV_Score,[0.6636363636363637]
Training_Error,[0.15000000000000002]
Validation_Error,[0.3928571428571429]
method,Logistic regression
C,[1]
Best_CV_Score,[0.7090909090909091]
Training_Error,[0.2727272727272727]
Validation_Error,[0.3214285714285714]
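The bracketed values above come from the way analysis.py assembles each model's summary: the scores and hyperparameters are stored as one-element lists inside a dict, the dict is wrapped in a pd.Series, and to_csv then writes each list using its Python repr. A minimal sketch of that effect, reusing the Decision Tree numbers above:

import pandas as pd

# one-element lists inside a Series are written out as their repr, e.g. "[7]"
d = {'method': 'Decision Tree',
     'Best_max_depth': [7],
     'Best_CV_Score': [0.6636363636363637]}
print(pd.Series(d).to_csv(header=False))
# method,Decision Tree
# Best_max_depth,[7]
# Best_CV_Score,[0.6636363636363637]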
60 changes: 54 additions & 6 deletions src/analysis.py
@@ -12,12 +12,16 @@
"""

import pandas as pd
import numpy as np
from docopt import docopt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

opt = docopt(__doc__)

@@ -39,17 +43,61 @@ def main(train_data, local_path):

# fit a decision tree classifier, using GridSearchCV to find the best max_depth and min_samples_split hyperparameters
model = DecisionTreeClassifier()
param_grid = {'max_depth': list(range(2,15))}
param_grid = {'max_depth': list(range(2,15)),'min_samples_split':list(range(2,5))}
CV = GridSearchCV(model, param_grid, cv = 10, refit=True)
CV.fit(X_train, y_train)
d = {'Best_max_depth':[CV.best_params_['max_depth']],
'Best_CV_Score':[CV.best_score_],
'Training_Error':[1 - CV.score(X_train, y_train)],
'Validation_Error':[1 - CV.score(X_valid, y_valid)]}
model_score = pd.DataFrame(d)
d = {'method':'Decision Tree',
'Best_max_depth':[CV.best_params_['max_depth']],
'Best_min_samples_split':[CV.best_params_['min_samples_split']],
'Best_CV_Score':[CV.best_score_],
'Training_Error':[1 - CV.score(X_train, y_train)],
'Validation_Error':[1 - CV.score(X_valid, y_valid)]}
d = pd.Series(d)


# try random forest model
forest = RandomForestClassifier(n_estimators=100)
param_grid_f = {'max_depth': list(range(2,15)),'min_samples_split':list(range(2,5))}
# 'n_estimators': list(range(80,120))

CV_f = GridSearchCV(forest, param_grid_f, cv = 5, refit=True)
CV_f.fit(X_train, np.ravel(y_train))
d_f = { 'method':'Random Forest',
'Best_max_depth':[CV_f.best_params_['max_depth']],
'Best_min_samples_split':[CV_f.best_params_['min_samples_split']],
#'Best_n_estimators': [CV_f.best_params_['n_estimators']],
'Best_CV_Score':[CV_f.best_score_],
'Training_Error':[1 - CV_f.score(X_train, y_train)],
'Validation_Error':[1 - CV_f.score(X_valid, y_valid)]}
d_f = pd.Series(d_f)


# try logistic regression
model_r = LogisticRegression(solver='lbfgs')
param_grid_r = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
CV_r = GridSearchCV(model_r, param_grid_r, cv = 10, refit=True)
CV_r.fit(X_train, y_train['Class'])
d_r = {'method':'Logistic regression',
'C':[CV_r.best_params_['C']],
'Best_CV_Score':[CV_r.best_score_],
'Training_Error':[1 - CV_r.score(X_train, y_train)],
'Validation_Error':[1 - CV_r.score(X_valid, y_valid)]}
d_r = pd.Series(d_r)

d_2 = pd.concat([d_f, d])
model_score = pd.concat([d_2, d_r])

# logistic regression gives the best CV score of the three models, so compute a confusion matrix for it
model_better = LogisticRegression(solver='lbfgs', C=CV_r.best_params_['C'])
model_better.fit(X_train, y_train['Class'])
y_pred = model_better.predict(X_valid)
confusion = pd.DataFrame(confusion_matrix(y_valid, y_pred))



# write results to csv
model_score.to_csv("%s/analysis_result.csv" % local_path)
confusion.to_csv("%s/analysis_confusion.csv" % local_path)


if __name__ == "__main__":
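Because the keys in analysis_result.csv repeat once per model, reading it back takes a little reshaping. A minimal sketch (assuming the results/ paths written above) that recovers each model's cross-validation score and confirms that logistic regression scores highest, as noted in the script:

import pandas as pd

# key,value rows with one block per model; the file has no header row
rows = pd.read_csv("results/analysis_result.csv", header=None, names=["key", "value"])

# each block starts with a "method" row; carry the method name down its block
rows["method"] = rows["value"].where(rows["key"] == "method").ffill()

# pull out the CV scores and strip the list brackets, e.g. "[0.709...]" -> 0.709...
cv = rows[rows["key"] == "Best_CV_Score"].copy()
cv["score"] = cv["value"].str.strip("[]").astype(float)

print(cv[["method", "score"]].sort_values("score", ascending=False))
# Logistic regression ~0.709, Random Forest ~0.695, Decision Tree ~0.664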
