-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathLDAprep.R
67 lines (65 loc) · 3.09 KB
/
LDAprep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#' Create Lda-ready Dataset
#'
#' This function transforms a text corpus such as the result of
#' \code{\link{cleanTexts}} into the form needed by the \code{\link{lda}}-package.
#'
#'
#' @param text A list of tokenized texts
#' @param vocab A character vector containing all words which should beused for
#' lda
#' @param reduce Logical: Should empty texts be deleted?
#' @return A list in which every entry contains a matrix with two rows: The
#' first row gives the number of the entry of the word in \code{vocab} minus
#' one, the second row is 1 and the number of the
#' occurrence of the word will be shown by the number of columns belonging to
#' this word.
#' @keywords manip
#' @examples
#' texts <- list(A="Give a Man a Fish, and You Feed Him for a Day.
#' Teach a Man To Fish, and You Feed Him for a Lifetime",
#' B="So Long, and Thanks for All the Fish",
#' C="A very able manipulative mathematician, Fisher enjoys a real mastery
#' in evaluating complicated multiple integrals.")
#'
#' corpus <- textmeta(meta=data.frame(id=c("A", "B", "C", "D"),
#' title=c("Fishing", "Don't panic!", "Sir Ronald", "Berlin"),
#' date=c("1885-01-02", "1979-03-04", "1951-05-06", "1967-06-02"),
#' additionalVariable=1:4, stringsAsFactors=FALSE), text=texts)
#'
#' corpus <- cleanTexts(corpus)
#' wordlist <- makeWordlist(corpus$text)
#' LDAprep(text=corpus$text, vocab=wordlist$words, reduce = TRUE)
#'
#' @export LDAprep
#' @import data.table
LDAprep <- function(text, vocab,
reduce = TRUE){
stopifnot(is.textmeta(textmeta(text = text)), is.character(vocab),
is.logical(reduce), length(reduce) == 1)
text <- lapply(text, unlist)
. <- NULL # for cran check
id <- NULL # for cran check
vtable <- data.table::data.table(word = vocab, id = seq_along(vocab) - 1L, key = "word")
text <- lapply(text,
function(words) rbind(vtable[.(words), "id", nomatch = 0L][order(id)]$id, 1L))
# ltable = rbindlist(mapply(
# function(article, words) {
# data.table(word = words, article = article)[, list(article = article, N = .N), by = "word"]
# }, article = names(text), words = text, SIMPLIFY = FALSE))
#
# ltable = merge(ltable, vtable, all.x = FALSE, all.y = FALSE, on = "word")
# rm(vtable)
# res = ltable[, list(x = list(rbind(id, 1))), by = article]
# res = setNames(x$x, x$article)
#})
# x(split(tmp$id, tmp$article)
#
if(reduce){ # delete entries where dimension is not computable
tmp <- lengths(lapply(text, dim)) == 0
if (any(tmp)) text <- text[!tmp]
Dim <- sapply(text, dim)
text <- text[Dim[2,] != 0]
text <- text[Dim[1,] != 1]
}
return(text)
}