-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_gbm.R
75 lines (62 loc) · 2.88 KB
/
run_gbm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
library(gbm)
#' Cross-validate a gbm classifier over pre-split feature sets
#'
#' For every element of `features.split`, a `gbm` model is fitted on the
#' row-bound remaining elements and used to predict the held-out element. A
#' `confusionMatrix` is stored per round; the list of matrices is then handed to
#' `aggregate_confusion_matrix()`, whose result is printed into
#' `<outputpath>/<filestem>confusionMatrix_combined.txt` and returned.
#'
#' `outputpath`, `is_known_author()` and `aggregate_confusion_matrix()` are not
#' defined inside this function; they must be visible from the environment in
#' which it is defined (e.g. the global environment).
#'
#' @param features.split A list of tables accepted by `rbindlist()`. Each is
#'   expected to carry an `is_poetry` column (the response) and may carry an
#'   `author` or `varia_author` column.
#' @param filestem Prefix prepended to the output file name.
#' @param distribution Passed to `gbm()` as `distribution`.
#' @param n.trees Passed to `gbm()` and `predict.gbm()` as `n.trees`.
#' @return The value returned by `aggregate_confusion_matrix()`.
run_gbm <- function(features.split, filestem="", distribution="multinomial", n.trees=1) {
  # One slot per held-out set; seq_along() keeps the loop empty (instead of
  # iterating over c(1, 0)) when `features.split` has no elements.
  matrices <- vector("list", length(features.split))

  for (set_no in seq_along(features.split)) {
    print(paste0("Starting run_gbm round ", set_no, " at ", date()))

    # Training group: every portion except the current one.
    features <- rbindlist(features.split[-set_no], use.names = TRUE)
    is_poetry <- features$is_poetry

    # On the fly! (Part 1)
    # Recode the author column via is_known_author(), using only the authors
    # of training rows labelled "POETRY" so the held-out set does not leak in.
    if ("author" %in% names(features)) {
      poetry_authors <- features$author[which(is_poetry == "POETRY")]
      features$author <- is_known_author(features$author,
                                         poetry_authors = poetry_authors,
                                         ignore_NA = TRUE)
    } else if ("varia_author" %in% names(features)) {
      poetry_authors <- features$varia_author[which(is_poetry == "POETRY")]
      features$varia_author <- is_known_author(features$varia_author,
                                               poetry_authors = poetry_authors,
                                               ignore_NA = TRUE)
    }

    gbm_model <- gbm(is_poetry ~ ., data = features,
                     distribution = distribution, n.trees = n.trees)

    # Test group: the held-out portion.
    features2 <- rbindlist(features.split[set_no])
    is_poetry2 <- features2$is_poetry

    # On the fly! (Part 2)
    # Same recoding as above, reusing the training-derived `poetry_authors`.
    # The branch is chosen from the training columns, as in Part 1.
    if ("author" %in% names(features)) {
      features2$author <- is_known_author(features2$author,
                                          poetry_authors = poetry_authors,
                                          ignore_NA = TRUE)
    } else if ("varia_author" %in% names(features)) {
      features2$varia_author <- is_known_author(features2$varia_author,
                                                poetry_authors = poetry_authors,
                                                ignore_NA = TRUE)
    }

    x2 <- subset(features2, select = -is_poetry)
    prob2 <- predict.gbm(gbm_model, x2, n.trees = n.trees)
    # Pick, per row, the class (column name) with the highest score.
    pred2 <- colnames(prob2)[apply(prob2, 1, which.max)]

    # confusionMatrix() is documented to take factors; `pred2` is a character
    # vector, so give predictions and reference one shared set of levels.
    reference <- as.factor(is_poetry2)
    class_levels <- union(levels(reference), pred2)
    reference <- factor(reference, levels = class_levels)
    predicted <- factor(pred2, levels = class_levels)

    # NOTE(review): training rows are matched against the label "POETRY"
    # above, while `positive` is "TRUE" here -- confirm which labels
    # `is_poetry` actually holds.
    matrices[[set_no]] <- confusionMatrix(data = predicted,
                                          reference = reference,
                                          positive = "TRUE")
    gc()
  }

  sink(file = paste0(outputpath, "/", filestem, "confusionMatrix_combined.txt"),
       append = FALSE)
  # Always undo the redirection, even if aggregation or printing fails.
  on.exit(sink(), add = TRUE)

  # Kept inside the sink so anything the aggregation prints lands in the file.
  aggregated_results <- aggregate_confusion_matrix(matrices)
  print(aggregated_results)
  aggregated_results
}