-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_varimp_ranks.R
119 lines (104 loc) · 4.72 KB
/
get_varimp_ranks.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
get_varimp_ranks <- function(filepath,
filenamestem,
outputfile="",
thresholds=c(0.01,0.015,100,100,100,100)) {
if (outputfile=="") {
outputfilepath <- paste0(filepath, "/", filenamestem, "_all_varimps.csv" )
} else {
outputfilepath <- paste0(filepath, "/", outputfile)
}
if (file.exists(paste0(filepath, "/", filenamestem, "__variable_importance_caret.txt"))) {
caret_file <- paste0(filepath, "/", filenamestem, "__variable_importance_caret.txt")
} else {
caret_file <- paste0(filepath, "/", filenamestem, "_caret_ntree250_mtry10__variable_importance_caret.txt")
}
# Caret rank
tmp <- read.csv2(file = caret_file,
header = TRUE,
quote = "",
sep = "",
stringsAsFactors = FALSE,
row.names = 1
)
df_tmp <- data.frame(tmp, stringsAsFactors = FALSE)
df_tmp$caret_rank <- NA
order.scores <- order(df_tmp$NONPOETRY, row.names(df_tmp))
df_tmp$caret_rank[order.scores] <- 1:nrow(df_tmp)
df_tmp$caret_threshold <- ""
df_tmp$caret_threshold[which(df_tmp$NONPOETRY >= thresholds[1])] <- "TRUE"
measures <- df_tmp
# Conditional rank
if (file.exists(paste0(filepath, "/", filenamestem, "__variable_importance_conditional.txt"))) {
conditional_file <- paste0(filepath, "/", filenamestem, "__variable_importance_conditional.txt")
} else {
conditional_file <- paste0(filepath, "/", filenamestem, "_caret_ntree250_mtry10__variable_importance_conditional.txt")
}
tmp <- read.csv2(file = conditional_file,
header = TRUE,
quote = "",
sep = "",
stringsAsFactors = FALSE,
row.names = 1
)
df_tmp <- data.frame(tmp, stringsAsFactors = FALSE)
df_tmp$conditional_rank <- NA
order.scores <- order(df_tmp$conditional_importance, row.names(df_tmp))
df_tmp$conditional_rank[order.scores] <- 1:nrow(df_tmp)
df_tmp$conditional_threshold <- ""
df_tmp$conditional_threshold[which(df_tmp$conditional_importance >= thresholds[2])] <- "TRUE"
measures$conditional_importance <- df_tmp$conditional_importance
measures$conditional_rank <- df_tmp$conditional_rank
measures$conditional_threshold <- df_tmp$conditional_threshold
# randomForest ranks
if (file.exists(paste0(filepath, "/", filenamestem, "__variable_importance_randomForest_no_cut.txt"))) {
rf_file <- paste0(filepath, "/", filenamestem, "__variable_importance_randomForest_no_cut.txt")
} else {
rf_file <- paste0(filepath, "/", filenamestem, "_caret_ntree250_mtry10__variable_importance_randomForest_no_cut.txt")
}
tmp <- read.csv2(file = rf_file,
header = TRUE,
quote = "",
sep = "",
stringsAsFactors = FALSE,
row.names = 1
)
df_tmp <- data.frame(tmp, stringsAsFactors = FALSE)
# NONPOETRY
df_tmp$nonpoetry_rank <- NA
order.scores <- order(as.numeric(df_tmp$NONPOETRY), row.names(df_tmp))
df_tmp$nonpoetry_rank[order.scores] <- 1:nrow(df_tmp)
df_tmp$nonpoetry_threshold <- ""
df_tmp$nonpoetry_threshold[which(as.integer(df_tmp$NONPOETRY) >= thresholds[3])] <- "TRUE"
measures$nonpoetry <- df_tmp$NONPOETRY
measures$nonpoetry_rank <- df_tmp$nonpoetry_rank
measures$nonpoetry_threshold <- df_tmp$nonpoetry_threshold
# POETRY
df_tmp$poetry_rank <- NA
order.scores <- order(as.numeric(df_tmp$POETRY), row.names(df_tmp))
df_tmp$poetry_rank[order.scores] <- 1:nrow(df_tmp)
df_tmp$poetry_threshold <- ""
df_tmp$poetry_threshold[which(as.integer(df_tmp$POETRY) >= thresholds[4])] <- "TRUE"
measures$poetry <- df_tmp$POETRY
measures$poetry_rank <- df_tmp$poetry_rank
measures$poetry_threshold <- df_tmp$poetry_threshold
# MeanDecreaseAccuracy
df_tmp$MDA_rank <- NA
order.scores <- order(as.numeric(df_tmp$MeanDecreaseAccuracy), row.names(df_tmp))
df_tmp$MDA_rank[order.scores] <- 1:nrow(df_tmp)
df_tmp$MDA_threshold <- ""
df_tmp$MDA_threshold[which(as.integer(df_tmp$MeanDecreaseAccuracy) >= thresholds[5])] <- "TRUE"
measures$MDA <- df_tmp$MeanDecreaseAccuracy
measures$MDA_rank <- df_tmp$MDA_rank
measures$MDA_threshold <- df_tmp$MDA_threshold
# MeanDecreaseGini
df_tmp$MDG_rank <- NA
order.scores <- order(as.numeric(df_tmp$MeanDecreaseGini), row.names(df_tmp))
df_tmp$MDG_rank[order.scores] <- 1:nrow(df_tmp)
df_tmp$MDG_threshold <- ""
df_tmp$MDG_threshold[which(as.integer(df_tmp$MeanDecreaseGini) >= thresholds[6])] <- "TRUE"
measures$MDG <- df_tmp$MeanDecreaseGini
measures$MDG_rank <- df_tmp$MDG_rank
measures$MDG_threshold <- df_tmp$MDG_threshold
write.csv2(measures, outputfilepath, quote=FALSE, row.names = TRUE)
return(measures)
}