
Commit

update analysis.py: add two more models and a confusion matrix
xiw315 committed Jan 31, 2020
1 parent 254f679 commit cc48cd2
Showing 3 changed files with 74 additions and 8 deletions.
3 changes: 3 additions & 0 deletions results/analysis_confusion.csv
@@ -0,0 +1,3 @@
,0,1
0,20,7
1,11,18
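These counts imply a validation accuracy of (20 + 18) / 56, about 0.679, for the tuned logistic regression, which matches the Validation_Error of 0.321 reported in analysis_result.csv below. A minimal sketch of deriving the usual summary metrics from this file, assuming class 1 is treated as the positive class:

import pandas as pd

# rows are true labels, columns are predictions (sklearn's confusion_matrix convention)
confusion = pd.read_csv("results/analysis_confusion.csv", index_col=0)
tn, fp = confusion.iloc[0, 0], confusion.iloc[0, 1]   # 20, 7
fn, tp = confusion.iloc[1, 0], confusion.iloc[1, 1]   # 11, 18

accuracy = (tp + tn) / (tp + tn + fp + fn)   # 38/56, about 0.679
precision = tp / (tp + fp)                   # 18/25 = 0.72
recall = tp / (tp + fn)                      # 18/29, about 0.621
print(accuracy, precision, recall)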
19 changes: 17 additions & 2 deletions results/analysis_result.csv
@@ -1,2 +1,17 @@
,Best_max_depth,Best_CV_Score,Training_Error,Validation_Error
0,7,0.65,0.14090909090909087,0.3571428571428571
method,Random Forest
Best_max_depth,[5]
Best_min_samples_split,[2]
Best_CV_Score,[0.6954545454545454]
Training_Error,[0.15909090909090906]
Validation_Error,[0.2678571428571429]
method,Decision Tree
Best_max_depth,[7]
Best_min_samples_split,[4]
Best_CV_Score,[0.6636363636363637]
Training_Error,[0.15000000000000002]
Validation_Error,[0.3928571428571429]
method,Logistic regression
C,[1]
Best_CV_Score,[0.7090909090909091]
Training_Error,[0.2727272727272727]
Validation_Error,[0.3214285714285714]
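The bracketed values above come from the way analysis.py assembles each model's summary: the scores and hyperparameters are stored as one-element lists inside a dict, the dict is wrapped in a pd.Series, and to_csv then writes each list using its Python repr. A minimal sketch of that effect, reusing the Decision Tree numbers above:

import pandas as pd

# one-element lists inside a Series are written out as their repr, e.g. "[7]"
d = {'method': 'Decision Tree',
     'Best_max_depth': [7],
     'Best_CV_Score': [0.6636363636363637]}
print(pd.Series(d).to_csv(header=False))
# method,Decision Tree
# Best_max_depth,[7]
# Best_CV_Score,[0.6636363636363637]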
60 changes: 54 additions & 6 deletions src/analysis.py
@@ -12,12 +12,16 @@
"""

import pandas as pd
import numpy as np
from docopt import docopt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

opt = docopt(__doc__)

@@ -39,17 +43,61 @@ def main(train_data, local_path):

# fit a decision tree classifier, using GridSearchCV to find the best max_depth and min_samples_split hyperparameters
model = DecisionTreeClassifier()
param_grid = {'max_depth': list(range(2,15))}
param_grid = {'max_depth': list(range(2,15)),'min_samples_split':list(range(2,5))}
CV = GridSearchCV(model, param_grid, cv = 10, refit=True)
CV.fit(X_train, y_train)
d = {'Best_max_depth':[CV.best_params_['max_depth']],
'Best_CV_Score':[CV.best_score_],
'Training_Error':[1 - CV.score(X_train, y_train)],
'Validation_Error':[1 - CV.score(X_valid, y_valid)]}
model_score = pd.DataFrame(d)
d = {'method':'Decision Tree',
'Best_max_depth':[CV.best_params_['max_depth']],
'Best_min_samples_split':[CV.best_params_['min_samples_split']],
'Best_CV_Score':[CV.best_score_],
'Training_Error':[1 - CV.score(X_train, y_train)],
'Validation_Error':[1 - CV.score(X_valid, y_valid)]}
d = pd.Series(d)


# try random forest model
forest = RandomForestClassifier(n_estimators=100)
param_grid_f = {'max_depth': list(range(2,15)),'min_samples_split':list(range(2,5))}
# 'n_estimators': list(range(80,120))

CV_f = GridSearchCV(forest, param_grid_f, cv = 5, refit=True)
CV_f.fit(X_train, np.ravel(y_train))
d_f = { 'method':'Random Forest',
'Best_max_depth':[CV_f.best_params_['max_depth']],
'Best_min_samples_split':[CV_f.best_params_['min_samples_split']],
#'Best_n_estimators': [CV_f.best_params_['n_estimators']],
'Best_CV_Score':[CV_f.best_score_],
'Training_Error':[1 - CV_f.score(X_train, y_train)],
'Validation_Error':[1 - CV_f.score(X_valid, y_valid)]}
d_f = pd.Series(d_f)


# try logistic regression
model_r = LogisticRegression(solver='lbfgs')
param_grid_r = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
CV_r = GridSearchCV(model_r, param_grid_r, cv = 10, refit=True)
CV_r.fit(X_train, y_train['Class'])
d_r = {'method':'Logistic regression',
'C':[CV_r.best_params_['C']],
'Best_CV_Score':[CV_r.best_score_],
'Training_Error':[1 - CV_r.score(X_train, y_train)],
'Validation_Error':[1 - CV_r.score(X_valid, y_valid)]}
d_r = pd.Series(d_r)

d_2 = pd.concat([d_f, d])
model_score = pd.concat([d_2, d_r])

# logistic regression gives the best CV score of the three models, so compute a confusion matrix for it
model_better = LogisticRegression(solver='lbfgs', C=CV_r.best_params_['C'])
model_better.fit(X_train, y_train['Class'])
y_pred = model_better.predict(X_valid)
confusion = pd.DataFrame(confusion_matrix(y_valid, y_pred))



# write results to csv
model_score.to_csv("%s/analysis_result.csv" % local_path)
confusion.to_csv("%s/analysis_confusion.csv" % local_path)


if __name__ == "__main__":
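Because the keys in analysis_result.csv repeat once per model, reading it back takes a little reshaping. A minimal sketch (assuming the results/ paths written above) that recovers each model's cross-validation score and confirms that logistic regression scores highest, as noted in the script:

import pandas as pd

# key,value rows with one block per model; the file has no header row
rows = pd.read_csv("results/analysis_result.csv", header=None, names=["key", "value"])

# each block starts with a "method" row; carry the method name down its block
rows["method"] = rows["value"].where(rows["key"] == "method").ffill()

# pull out the CV scores and strip the list brackets, e.g. "[0.709...]" -> 0.709...
cv = rows[rows["key"] == "Best_CV_Score"].copy()
cv["score"] = cv["value"].str.strip("[]").astype(float)

print(cv[["method", "score"]].sort_values("score", ascending=False))
# Logistic regression ~0.709, Random Forest ~0.695, Decision Tree ~0.664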
