-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_features.R
119 lines (109 loc) · 6.78 KB
/
get_features.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
library(stringr)
get_features <- function(df,
clean_titles,
freq_words_p,
freq_words_non_p,
freq_words_all,
p50_words,
non_p50_words,
p_next100,
non_p_next100,
antique_names
) {
#message(paste0("freq_words_p: ", is.null(freq_words_p)))
# Prepare the word lists, so that the words will be exact matches
#freq_words_p <- paste0("(^| )", freq_words_p, "( |$)")
#freq_words_non_p <- paste0("(^| )", freq_words_non_p, "( |$)")
#freq_words_all <- paste0("(^| )", freq_words_all, "( |$)")
#p50_words <- paste0("(^| )", p50_words, "( |$)")
#non_p50_words <- paste0("(^| )", non_p50_words, "( |$)")
#p_next100 <- paste0("(^| )", p_next100, "( |$)")
#non_p_next100 <- paste0("(^| )", non_p_next100, "( |$)")
#antique_names <- paste0("(^| )", antique_names, "( |$)")
# Start preparing the features
no_of_chars <- nchar(as.character(df$whole_title_sans_edition))
no_of_words <- str_count(as.character(df$whole_title_sans_edition), " ") + 1
no_of_verbs <- str_count(df$tagged, "/VB")
no_of_interjections <- str_count(df$tagged, "/UH")
no_of_adjectives <- str_count(df$tagged, "/JJ")
no_of_adverbs <- str_count(df$tagged, "/RB")
no_of_foreign_words <- str_count(df$tagged, "/FW")
no_of_proper_nouns <- str_count(df$tagged, "/NNP")
no_of_pronouns <- str_count(df$tagged, "/PRP")
no_of_gerunds <- str_count(df$tagged, "/VBG")
no_of_verbs_past <- str_count(df$tagged, "/VBD|/VBN")
no_of_chars_title_only <- nchar(as.character(df$title))
no_of_words_title_only <- str_count(as.character(df$title), " ") + 1
len_words <- str_count(as.character(df$whole_title), " ") / nchar(as.character(df$whole_title_sans_edition))
no_of_chars_remainder <- nchar(as.character(df$title_remainder))
no_of_words_remainder <- str_count(as.character(df$title_remainder), " ") + 1
#no_of_poetry_words <- unlist(lapply(X = clean_titles, FUN = function(t) {sum(str_count(t, freq_words_p))}))
#no_of_non_poetry_words <- unlist(lapply(X = clean_titles, FUN = function(t) {sum(str_count(t, freq_words_non_p))}))
#no_of_common_words <- unlist(lapply(X = clean_titles, FUN = function(t) {sum(str_count(t, freq_words_all))}))
#no_of_poetry_words_share50 <- unlist(lapply(X = clean_titles, FUN = function(t) {sum(str_count(t, p50_words))}))
#no_of_poetry_words_share100 <- unlist(lapply(X = clean_titles, FUN = function(t) {sum(str_count(t, p_next100))}))
#no_of_non_poetry_words_share50 <- unlist(lapply(X = clean_titles, FUN = function(t) {sum(str_count(t, non_p50_words))}))
#no_of_non_poetry_words_share100 <- unlist(lapply(X = clean_titles, FUN = function(t) {sum(str_count(t, non_p_next100))}))
#no_of_antique_names <- unlist(lapply(X = clean_titles, FUN = function(t) {sum(str_count(t, antique_names))}))
no_of_question_marks <- str_count(as.character(df$whole_title_sans_edition), "[?]")
no_of_exclamation_marks <- str_count(as.character(df$whole_title_sans_edition), "[!]")
no_of_sentences <- str_count(as.character(df$whole_title_sans_edition), "[?!.]")
no_of_commas <- str_count(as.character(df$whole_title_sans_edition), "[,]")
no_of_poetry_words <- unlist(lapply(X = clean_titles, FUN = function(t) {
length(which(unlist(str_split(t, " ")) %in% freq_words_p))}))
no_of_non_poetry_words <- unlist(lapply(X = clean_titles, FUN = function(t) {
length(which(unlist(str_split(t, " ")) %in% freq_words_non_p))}))
no_of_common_words <- unlist(lapply(X = clean_titles, FUN = function(t) {
length(which(unlist(str_split(t, " ")) %in% freq_words_all))}))
no_of_poetry_words_share50 <- unlist(lapply(X = clean_titles, FUN = function(t) {
length(which(unlist(str_split(t, " ")) %in% p50_words))}))
no_of_poetry_words_share100 <- unlist(lapply(X = clean_titles, FUN = function(t) {
length(which(unlist(str_split(t, " ")) %in% p_next100))}))
no_of_non_poetry_words_share50 <- unlist(lapply(X = clean_titles, FUN = function(t) {
length(which(unlist(str_split(t, " ")) %in% non_p50_words))}))
no_of_non_poetry_words_share100 <- unlist(lapply(X = clean_titles, FUN = function(t) {
length(which(unlist(str_split(t, " ")) %in% non_p_next100))}))
no_of_antique_names <- unlist(lapply(X = clean_titles, FUN = function(t) {
length(which(unlist(str_split(t, " ")) %in% antique_names))}))
author_age <- as.integer(df$publication_year_from) - as.integer(df$author_birth)
pagecount <- df$pagecount
physical_extent <- df$gatherings
publication_year <- df$publication_year_from
# Fix NAs: Which is a correct way to do this?
author_age[which(is.na(author_age))] <- median.default(author_age, na.rm=TRUE)
pagecount[which(is.na(pagecount))] <- median.default(pagecount, na.rm=TRUE)
publication_year[which(is.na(publication_year))] <- median.default(publication_year, na.rm=TRUE)
return (data.frame(no_of_chars=no_of_chars,
no_of_words=no_of_words,
no_of_verbs=no_of_verbs,
no_of_interjections=no_of_interjections,
no_of_adjectives=no_of_adjectives,
no_of_adverbs=no_of_adverbs,
no_of_foreign_words=no_of_foreign_words,
no_of_proper_nouns=no_of_proper_nouns,
no_of_pronouns=no_of_pronouns,
no_of_gerunds=no_of_gerunds,
no_of_verbs_past=no_of_verbs_past,
no_of_chars_title_only=no_of_chars_title_only,
no_of_words_title_only=no_of_words_title_only,
len_words=len_words,
no_of_chars_remainder=no_of_chars_remainder,
no_of_words_remainder=no_of_words_remainder,
no_of_question_marks=no_of_question_marks,
no_of_exclamation_marks=no_of_exclamation_marks,
no_of_sentences=no_of_sentences,
no_of_commas=no_of_commas,
no_of_poetry_words=no_of_poetry_words,
no_of_non_poetry_words=no_of_non_poetry_words,
no_of_common_words=no_of_common_words,
no_of_poetry_words_share50=no_of_poetry_words_share50,
no_of_non_poetry_words_share50=no_of_non_poetry_words_share50,
no_of_poetry_words_share100=no_of_poetry_words_share100,
no_of_non_poetry_words_share100=no_of_non_poetry_words_share100,
no_of_antique_names=no_of_antique_names,
author_age=author_age,
pagecount=pagecount,
physical_extent=physical_extent,
publication_year=publication_year
))
}