Skip to content

Commit

Permalink
Set the random state so that the results are reproducible
Browse files Browse the repository at this point in the history
  • Loading branch information
xiw315 committed Feb 6, 2020
1 parent 3adcd55 commit 85f487c
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 14 deletions.
19 changes: 9 additions & 10 deletions results/analysis_result.csv
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@

method,Random Forest
Best_max_depth,[3]
Best_min_samples_split,[3]
Best_CV_Score,[0.7045454545454546]
Training_Error,[0.25909090909090904]
Best_max_depth,[4]
Best_min_samples_split,[2]
Best_CV_Score,[0.6954545454545454]
Training_Error,[0.2090909090909091]
Validation_Error,[0.3035714285714286]
method,Decision Tree
Best_max_depth,[7]
Best_min_samples_split,[3]
Best_CV_Score,[0.6545454545454545]
Training_Error,[0.1454545454545455]
Validation_Error,[0.4107142857142857]
Best_min_samples_split,[4]
Best_CV_Score,[0.6590909090909091]
Training_Error,[0.15000000000000002]
Validation_Error,[0.3571428571428571]
method,Logistic regression
C,[1]
Best_CV_Score,[0.7090909090909091]
Training_Error,[0.2727272727272727]
Validation_Error,[0.3214285714285714]
Validation_Error,[0.3214285714285714]
8 changes: 4 additions & 4 deletions src/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def main(train_data, local_path):


#fit and predict using decision tree classifier and GridSearchCV to find the best max_depth hyperparameter
model = DecisionTreeClassifier()
model = DecisionTreeClassifier(random_state= 100)
param_grid = {'max_depth': list(range(2,15)),'min_samples_split':list(range(2,5))}
CV = GridSearchCV(model, param_grid, cv = 10, refit=True)
CV.fit(X_train, y_train)
Expand All @@ -58,7 +58,7 @@ def main(train_data, local_path):


# try random forest model
forest = RandomForestClassifier(n_estimators =100)
forest = RandomForestClassifier(n_estimators =100, random_state=100)
param_grid_f = {'max_depth': list(range(2,15)),'min_samples_split':list(range(2,5))}
#'n_estimators': list(range(80,120))}

Expand All @@ -75,7 +75,7 @@ def main(train_data, local_path):


#try logistic regression
model_r = LogisticRegression(solver ='lbfgs')
model_r = LogisticRegression(solver ='lbfgs',random_state=100)
param_grid_r = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
CV_r = GridSearchCV(model_r, param_grid_r, cv = 10, refit=True)
CV_r.fit(X_train, y_train['Class'])
Expand All @@ -90,7 +90,7 @@ def main(train_data, local_path):
model_score = pd.concat([d_2, d_r])

#find out the best one is logistic regression, so we do a confusion matrix
model_better = LogisticRegression(solver ='lbfgs', C=CV_r.best_params_['C'])
model_better = LogisticRegression(solver ='lbfgs', C=CV_r.best_params_['C'],random_state=100)
model_better.fit(X_train, y_train['Class'])
y_pred = model_better.predict(X_valid)
confusion = pd.DataFrame(confusion_matrix(y_valid, y_pred))
Expand Down

0 comments on commit 85f487c

Please sign in to comment.