# KFoldCrossValidation_Study.R
#K-Fold cross-validation
#Compare results from different values of K
#Models: KNN, RandomForest, GLM, or SVM
#libraries
#library(dplyr) # for data manipulation
library(caret) # for model-building
#library(DMwR) # for smote implementation
#library(purrr) # for functional programming (map)
if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC"); #install only if missing
library(pROC) # for AUC calculations
####################
#Import data
source("C://Users//chris//OneDrive//Documentos//GitHub//ML_VotingAggregation//aggregateAnswerOptionsPerQuestion.R");
summaryTable <- runMain();
#summaryTable <- data.frame(summaryTable);
#I need to guarantee that some examples (i.e., failing methods)
#do not dominate the training or testing sets. To do that, I need to get a
#close to equal proportion of examples in both sets
#Scramble the dataset before extracting the training set.
set.seed(8850);
g <- runif(nrow(summaryTable)); #uniform random numbers used to shuffle the rows
summaryTable <- summaryTable[order(g),];
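#Sanity check (sketch): proportion of bug-covering examples after shuffling, to
#confirm neither class dominates; assumes summaryTable already carries the
#bugCovering column that is converted to labels further below.
print(prop.table(table(as.character(unlist(summaryTable[, "bugCovering"])))));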
##################################################
# Create trainControl to be reused by all models #
#convert columns to numeric
summaryTable<- data.frame(summaryTable, stringsAsFactors = FALSE)
summaryTable[,"rankingVote"] <- as.numeric(unlist(summaryTable[,"rankingVote"]));
summaryTable[,"Yes.Count"] <- as.numeric(unlist(summaryTable[,"Yes.Count"]));
summaryTable[,"majorityVote"] <- as.numeric(unlist(summaryTable[,"majorityVote"]));
summaryTable[,"explanatoryVariable"] <- summaryTable[,"majorityVote"];
summaryTable$bugCoveringLabels <- as.character(summaryTable$bugCovering);
summaryTable$bugCoveringLabels<- replace(summaryTable$bugCoveringLabels,summaryTable$bugCoveringLabels=="FALSE", "F");
summaryTable$bugCoveringLabels<- replace(summaryTable$bugCoveringLabels,summaryTable$bugCoveringLabels=="TRUE", "T");
summaryTable$bugCoveringLabels<- as.factor(summaryTable$bugCoveringLabels);
## Table to store the outcomes of training, model selection, and prediction
outcome <- matrix(ncol = 12, nrow = 40);
colnames(outcome)<- c("kfolds","trainingError","AUC","accuracy","trueNegatives","truePositives",
"falseNegatives","falsePositives","precision","recall","specificity","sensitivity");
#for(folds in 2:40){
folds <- 10;
# Create custom indices: myFolds
#Guarantees that the exact same folds are used for all models
#returnTrain = TRUE so that each element holds the training rows (k-1 folds);
#trainControl's index argument expects training indices, not held-out indices.
myFolds <- createFolds(summaryTable[,"explanatoryVariable"], k = folds, returnTrain = TRUE);
#Larger K implies less bias but larger variance (overfitting), i.e., modeling noise,
#which makes the predictions show large variation across folds.
#The reason is that a larger K makes the training sets large and very similar to each
#other, while each held-out fold becomes small.
#nice explanation here: https://stats.stackexchange.com/questions/27730/choice-of-k-in-k-fold-cross-validation
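#Illustrative sketch of the trade-off above: approximate size of each held-out
#fold for a few candidate values of K (held-out folds shrink as K grows).
print(sapply(c(2, 5, 10, 20, 40), function(k) floor(nrow(summaryTable) / k)));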
# Create reusable trainControl object: myControl
kFoldControl <- trainControl(
  index = myFolds, #train on k-1 folds, validate on the held-out fold
  classProbs = TRUE, #required for class probabilities and twoClassSummary
  verboseIter = TRUE, #print training progress
  savePredictions = TRUE, #keep hold-out predictions for inspection
  summaryFunction = twoClassSummary
);
#knnModel <- train(bugCoveringLabels ~ explanatoryVariable,summaryTable, method="knn", trControl=kFoldControl);
#rfModel<- train(bugCoveringLabels ~ explanatoryVariable,summaryTable, method="rf", trControl=kFoldControl);
#bayesglmModel<- train(bugCoveringLabels ~ explanatoryVariable,summaryTable, method="bayesglm", trControl=kFoldControl);
svmLinear <- train(bugCoveringLabels ~ explanatoryVariable,summaryTable, method="svmLinear", trControl=kFoldControl);
svmLinear2 <- train(bugCoveringLabels ~ explanatoryVariable,summaryTable, method="svmLinear2", trControl=kFoldControl);
svmLinearWeights <- train(bugCoveringLabels ~ explanatoryVariable,summaryTable, method="svmLinearWeights", trControl=kFoldControl, metric="Spec");
fitModel <- train(bugCoveringLabels ~ explanatoryVariable,summaryTable,
method="rf", trControl=kFoldControl, metric="Sens");
#Check whether the result changes when optimizing for sensitivity (i.e., fewer false
#negatives); note that caret treats the first factor level of the outcome as the positive class during training.
fitModel
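#Quick check (sketch): caret's twoClassSummary reports Sens/Spec relative to the
#first factor level of the outcome, while the confusionMatrix call below sets
#positive = "T" explicitly, so keep both in mind when reading the metrics.
print(levels(summaryTable$bugCoveringLabels));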
bugCoveringPredicted <- predict(fitModel, newdata = summaryTable); #predictions on the full data set (resubstitution/training error)
matrixResult<- confusionMatrix(data=bugCoveringPredicted,summaryTable$bugCoveringLabels, positive="T");
trueNegatives<- matrixResult$table[1,1];
truePositives<- matrixResult$table[2,2];
falseNegatives<- matrixResult$table[1,2];
falsePositives<- matrixResult$table[2,1];
#compute AUC
aucValue <- auc(roc(response = as.numeric(summaryTable$bugCoveringLabels),
                    predictor = as.numeric(bugCoveringPredicted))); #nested call: %>% is not available since dplyr/magrittr are not loaded
aucValue <- as.numeric(aucValue);
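#Alternative sketch (not part of the original pipeline): the AUC above uses hard
#class predictions, which gives a single operating point. Class probabilities
#from the fitted model could be used instead; "T" is the positive-class column
#returned by predict(..., type = "prob").
predictedProbs <- predict(fitModel, newdata = summaryTable, type = "prob");
aucFromProbs <- as.numeric(auc(roc(response = summaryTable$bugCoveringLabels,
                                   predictor = predictedProbs[, "T"])));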
accuracy <- (truePositives + trueNegatives) / (truePositives + trueNegatives + falsePositives + falseNegatives);
trainingError <- 1-accuracy;
precision <- truePositives / (truePositives + falsePositives);
recall <- truePositives / (truePositives + falseNegatives);
specificity <- trueNegatives / (trueNegatives + falsePositives);
sensitivity <- truePositives / (truePositives + falseNegatives); #identical to recall for the positive class
row <- folds-1;
#row<-1
outcome[row,"kfolds"]<-folds;
outcome[row,"trainingError"]<-trainingError;
outcome[row,"AUC"] <- aucValue;
outcome[row,"accuracy"]<-accuracy;
outcome[row,"trueNegatives"]<-trueNegatives;
outcome[row,"truePositives"]<-truePositives;
outcome[row,"falseNegatives"]<-falseNegatives;
outcome[row,"falsePositives"]<-falsePositives;
outcome[row,"precision"]<-precision;
outcome[row,"recall"]<-recall;
outcome[row,"sensitivity"]<-sensitivity;
outcome[row,"specificity"]<-specificity;
#}
write.csv(outcome, file = "./kfold-study/rf_sens_kfold_study.csv");
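#Sketch for comparing values of K (assumes the for-loop over folds in 2:40 above
#is enabled so that outcome holds one row per K): plot AUC against the number of folds.
plot(outcome[, "kfolds"], outcome[, "AUC"], type = "b",
     xlab = "number of folds (K)", ylab = "AUC");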