-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathknn_votingAggregation.R
77 lines (57 loc) · 2.48 KB
/
knn_votingAggregation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# K-nearest neighbor (KNN)
# Predict bug-covering questions based on various values of
# the parameters in the aggregation methods.

# Import the aggregated per-question summary data.
# NOTE(review): absolute Windows path — consider a relative path so the
# script runs on other machines.
source("C://Users//chris//OneDrive//Documentos//GitHub//ML_VotingAggregation//aggregateAnswerOptionsPerQuestion.R")
summaryTable <- runMain()

# Guarantee that some examples (i.e., failing methods) do not dominate the
# training or testing sets: shuffle the rows so both sets receive a
# close-to-equal proportion of examples.
set.seed(9850)
shuffleKey <- runif(nrow(summaryTable)) # random draw used only as a sort key
summaryTable <- summaryTable[order(shuffleKey), ]
###########################################################
# Below are two options to partition the data into training and testing sets.
# Option-1: hold out the first 2/3 of the (already shuffled) rows for
# training and the remaining 1/3 for testing.
totalData <- nrow(summaryTable)
trainingSize <- trunc(totalData * 2 / 3)
# FIX: the test partition must begin right after the last training row.
# The previous start index (totalData - trainingSize) fell INSIDE the
# training range, so the two sets overlapped.
startTestIndex <- trainingSize + 1
endTestIndex <- totalData
# Extract disjoint training and test data frames.
trainingData <- as.data.frame(summaryTable[seq_len(trainingSize), ])
testData <- as.data.frame(summaryTable[startTestIndex:endTestIndex, ])
##################################################################
# Option-2: mark every row with a sampled partition label — 1 (training)
# with probability 0.67, 2 (testing) with probability 0.33 — then split on it.
set.seed(4321)
partition <- sample(2, nrow(summaryTable), replace = TRUE, prob = c(0.67, 0.33))
trainingData <- as.data.frame(summaryTable[partition == 1, ])
testData <- as.data.frame(summaryTable[partition == 2, ])
# Obtain the ground truth (bug-covering flag) for each partition.
trainingLabels <- as.data.frame(summaryTable[partition == 1, "bugCovering"])
testLabels <- as.data.frame(summaryTable[partition == 2, "bugCovering"])
##################################################################
# Build the KNN model.
# FIX: install the 'class' package only when it is missing, instead of
# reinstalling it on every run of the script.
if (!requireNamespace("class", quietly = TRUE)) {
  install.packages("class")
}
library(class)
# FIX: the original subset the feature columns on summaryTable AFTER the
# train/test split, so the split data still contained every column —
# including the bugCovering label itself, leaking the answer to the
# classifier. Select the single predictive feature (rankingVote) on the
# split data instead; drop = FALSE keeps it a one-column data frame so
# knn() receives a matrix-like input.
trainingFeatures <- trainingData[, "rankingVote", drop = FALSE]
testFeatures <- testData[, "rankingVote", drop = FALSE]
# k = 2: classify each test row by its 2 nearest training neighbors.
fitModel <- knn(train = trainingFeatures, test = testFeatures,
                cl = trainingLabels[, 1], k = 2)
# (removed attributes(.Last.value): it only works interactively, right
# after the previous expression, and is fragile in a sourced script)
summary(fitModel)
# Evaluate the model.
testLabels <- data.frame(testLabels[, 1])
# Side-by-side comparison of predicted vs. actual bug-covering flags.
# FIX: renamed from 'merge' to avoid shadowing base::merge.
comparison <- data.frame(fitModel, testLabels)
names(comparison) <- c("Predicted bug", "Actual bug")
comparison
# FIX: install 'gmodels' only when missing instead of on every run.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}
library(gmodels)
# FIX: the confusion table must compare the TEST labels against the
# test-set predictions; the original passed the training labels, whose
# length does not even match fitModel.
CrossTable(x = testLabels[, 1], y = fitModel, prop.chisq = FALSE)
plot(fitModel)
fitModel
# Inspect the rows predicted as bug covering. (removed the unused
# duplicate 'fitFrame'.)
predictionFrame <- data.frame(fitModel)
# FIX: predictionFrame rows correspond to the TEST set, so index testData —
# the original indexed trainingData with a test-length mask.
# NOTE(review): fitModel is a factor; '== TRUE' relies on its levels being
# "TRUE"/"FALSE" — confirm bugCovering is logical upstream.
mean(testData[predictionFrame[, 1] == TRUE, "rankingVote"])
testData[predictionFrame[, 1] == TRUE, ]
predictionFrame[predictionFrame[, 1] == TRUE, ]