# wordCloudCV.R
# Builds a word cloud and a word-frequency bar plot from a text file.
# Install
# Only packages that are not yet available are installed, so re-running the
# script does not reinstall everything (and does not need network access when
# all of them are already present).
required_pkgs <- c("tm", "SnowballC", "wordcloud", "RColorBrewer")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# Load
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
# Choose the input text.
# Set `filePath` to the URL (or path) of a plain-text file to read it directly;
# leave the placeholder untouched to pick a local file interactively instead.
filePath <- "<select your own url>"
if (identical(filePath, "<select your own url>")) {
  # No location supplied: the placeholder is not something readLines() can
  # open, so ask the user for a file rather than failing.
  text <- readLines(file.choose())
} else {
  # Read the text file from the given location (e.g. the internet)
  text <- readLines(filePath)
}
# Wrap the text that was read in a tm corpus
docs <- Corpus(VectorSource(text))
# Print the corpus so its contents can be checked
inspect(docs)
# Transformer that puts a space wherever `pattern` matches
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})
# Blank out the special characters "/", "@" and "|" one after another
docs <- Reduce(
  function(corpus, pattern) tm_map(corpus, toSpace, pattern),
  c("/", "@", "\\|"),
  docs
)
# Lower-case everything
docs <- tm_map(docs, content_transformer(tolower))
# Drop digits
docs <- tm_map(docs, removeNumbers)
# Drop the common English stopwords
docs <- tm_map(
  docs,
  removeWords,
  stopwords("english")
)
# Drop punctuation
docs <- tm_map(docs, removePunctuation)
# Collapse the extra white space left behind by the steps above
docs <- tm_map(docs, stripWhitespace)
# Term-document matrix of the cleaned corpus
dtm <- TermDocumentMatrix(docs)
# Dense copy of the matrix, needed for rowSums()
m <- as.matrix(dtm)
# Total count per term, most frequent first
v <- rowSums(m)
v <- sort(v, decreasing = TRUE)
# Frequency table: the term and how often it occurs
d <- data.frame(
  word = names(v),
  freq = v
)
# Show the first ten rows of the table
head(d, n = 10)
# Draw the word cloud from the frequency table.
# The RNG seed is fixed immediately before drawing.
set.seed(1234)
wordcloud(
  words = d[["word"]],
  freq = d[["freq"]],
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
# Explore frequent terms: list the terms whose frequency reaches the lower
# bound of 4
findFreqTerms(dtm, lowfreq = 4)
# The frequency table of words, limited to the (up to) ten most frequent.
# head() is used instead of d[1:10, ] so that a table with fewer than ten
# terms does not get padded with NA rows, which would show up as empty "NA"
# bars in the plot below.
top_words <- head(d, 10)
top_words
# Plot word frequencies
barplot(top_words$freq, las = 3, names.arg = top_words$word,
        col = "purple", main = "Most frequent words",
        ylab = "Word frequencies")