-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path4_data_clean.R
38 lines (25 loc) · 1.42 KB
/
4_data_clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
## Data cleaning
# Quick overview of missing data: for each table, count how many cells are
# NA (TRUE) versus present (FALSE). These are top-level expressions, so the
# counts are only displayed when the script is run with auto-printing.
table(is.na(num_train))
table(is.na(num_test))
table(is.na(cat_train))
table(is.na(cat_test))
# Filter out highly correlated numeric variables.
library(caret)
# Pairwise correlation matrix of the numeric training columns.
x <- cor(num_train)
# Column indices that findCorrelation() recommends removing because their
# pairwise absolute correlation exceeds the 0.7 cutoff.
ax <- findCorrelation(x, cutoff = 0.7)
# Resolve the indices to column names. Selecting the columns to keep by name
# (instead of negative indexing) stays correct when `ax` is integer(0):
# `num_train[, -ax, with = FALSE]` would select zero columns in that case.
drop_cols <- names(num_train)[ax]
num_train <- num_train[, setdiff(names(num_train), drop_cols), with = FALSE]
# Remove exactly the same columns from the test set so that train and test
# keep identical numeric features, rather than hard-coding a column name.
num_test <- num_test[, setdiff(names(num_test), drop_cols), with = FALSE]
# Percentage of missing values in every column of the categorical tables.
# vapply() guarantees a numeric vector regardless of the input shape.
mvtr <- vapply(
  cat_train,
  function(col) sum(is.na(col)) / length(col) * 100,
  numeric(1)
)
mvte <- vapply(
  cat_test,
  function(col) sum(is.na(col)) / length(col) * 100,
  numeric(1)
)
# Some columns turn out to have more than 50% missing values, possibly
# because that information was hard to collect.
# Keep only the columns whose missing rate in the training set is below 5%.
# which() discards NA/NaN comparisons (e.g. for a zero-row table).
keep_cols <- names(cat_train)[which(mvtr < 5)]
cat_train <- cat_train[, keep_cols, with = FALSE]
# Apply the training-set selection to the test set as well: filtering the
# test set on its own missing rates (mvte) could leave train and test with
# different columns. mvte is still computed above for inspection.
cat_test <- cat_test[, intersect(keep_cols, names(cat_test)), with = FALSE]
# For the missing values that remain in cat_train and cat_test, a reasonable
# approach is to label them with an explicit "Unavailable" level.
# Each column is converted to character, its NAs are replaced, and the result
# is written back by reference as a factor.
for (i in seq_along(cat_train)) {
  filled <- as.character(cat_train[[i]])
  filled[is.na(filled)] <- "Unavailable"
  set(cat_train, j = i, value = factor(filled))
}
# Same treatment for the test set.
for (i in seq_along(cat_test)) {
  filled <- as.character(cat_test[[i]])
  filled[is.na(filled)] <- "Unavailable"
  set(cat_test, j = i, value = factor(filled))
}