Skip to content

Commit

Permalink
Merge pull request #54 from quadrama/DramaAnalysis-Tim
Browse files Browse the repository at this point in the history
Extend frequencytable() function
  • Loading branch information
nilsreiter authored Nov 2, 2017
2 parents 97224a6 + 7ab3e62 commit e3a66f7
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 28 deletions.
70 changes: 48 additions & 22 deletions R/frequencytable.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#' This function generates a matrix of word frequencies by figure or drama.
#' This function generates a matrix of word frequencies by drama, act or scene and/or by figure.
#' @param t The text table, potentially covering multiple texts
#' @param acceptedPOS A list of accepted pos tags
#' @param names Whether to use figure names or ids
#' @param byFigure Wether the count is by figure or by text
#' @param by Whether the count is by drama (default), act or scene
#' @param column The column name we should use (should be either Token.surface or Token.lemma)
#' @param sep The separation character that goes between drama name and figure (if applicable)
#' @param normalize Whether to normalize values or not
Expand All @@ -20,23 +21,31 @@
#' stylo(gui=F, frequencies = stylo_table)
#' }
#' @export
frequencytable <- function(t, acceptedPOS = postags$de$words, names=FALSE, column="Token.surface", byFigure=FALSE, sep="|", normalize=TRUE, sortResult=FALSE) {
frequencytable <- function(t, acceptedPOS = postags$de$words, names=FALSE, column="Token.surface", byFigure=FALSE, sep="|", normalize=FALSE, sortResult=FALSE, by="drama") {
ft <- t
if (length(acceptedPOS) > 0)
ft <- t[t$Token.pos %in% acceptedPOS,]

switch(by,
drama = {
if (byFigure == FALSE) { xt <- stats::xtabs(~drama + ft[,get(column)], data=ft) }
else if (names == TRUE) { xt <- stats::xtabs(~ paste(drama,Speaker.figure_surface,sep=sep) + ~ft[,get(column)], data=ft) }
else { xt <- stats::xtabs(~ paste(drama,Speaker.figure_id,sep=sep) + ~ft[,get(column)], data=ft) }
},
act = {
if (byFigure == FALSE) { xt <- stats::xtabs(~ paste(drama,Number.Act,sep=sep) + ~ft[,get(column)], data=ft) }
else if (names == TRUE) { xt <- stats::xtabs(~ paste(drama,Number.Act,Speaker.figure_surface,sep=sep) + ~ft[,get(column)], data=ft) }
else { xt <- stats::xtabs(~ paste(drama,Number.Act,Speaker.figure_id,sep=sep) + ~ft[,get(column)], data=ft) }
},
scene = {
if (byFigure == FALSE) { xt <- stats::xtabs(~ paste(drama,Number.Act,Number.Scene,sep=sep) + ~ft[,get(column)], data=ft) }
else if (names == TRUE) { xt <- stats::xtabs(~ paste(drama,Number.Act,Number.Scene,Speaker.figure_surface,sep=sep) + ~ft[,get(column)], data=ft) }
else { xt <- stats::xtabs(~ paste(drama,Number.Act,Number.Scene,Speaker.figure_id,sep=sep) + ~ft[,get(column)], data=ft) }
},
stop("Please enter valid string-value for argument 'by' (default = 'drama', 'act' or 'scene').")
)

if (byFigure == FALSE) {
xt <- stats::xtabs(~drama + ft[,get(column)], data=ft)
r <- as.matrix(stats::ftable(xt, row.vars = c(), col.vars = c()))
} else if (names == TRUE) {
xt <- stats::xtabs(~ paste(drama,Speaker.figure_surface,sep=sep) + ~ft[,get(column)], data=ft)
r <- as.matrix(stats::ftable(xt, row.vars = c(), col.vars = c()))
} else {
xt <- stats::xtabs(~paste(drama,Speaker.figure_id,sep=sep)+ft[,get(column)], data=ft)
r <- as.matrix(stats::ftable(xt, row.vars = c(), col.vars = c()))
}

r <- as.matrix(stats::ftable(xt, row.vars = c(), col.vars = c()))

if (normalize == TRUE) {
r <- t(apply(r,1,function(x) { x / sum(x)}))
Expand All @@ -55,18 +64,35 @@ frequencytable <- function(t, acceptedPOS = postags$de$words, names=FALSE, colum
#' @param acceptedPOS A list of accepted pos tags
#' @param names Whether to use figure names or ids
#' @param byFigure Wether the count is by figure or by text
#' @param by Whether the count is by drama (default), act or scene
#' @param cols The column names we should use (should be either Token.surface or Token.lemma)
#' @keywords internal
frequencytable2 <- function(t, acceptedPOS = postags$de$words, names=FALSE, cols=c("Token.surface", "Token.surface"), byFigure=FALSE) {
frequencytable2 <- function(t, acceptedPOS = postags$de$words, names=FALSE, cols=c("Token.surface", "Token.surface"), byFigure=FALSE, by="drama") {
ft <- t
if (length(acceptedPOS) > 0)
if (length(acceptedPOS) > 0) {
ft <- t[t$Token.pos %in% acceptedPOS,]
if (byFigure == FALSE)
index <- paste(ft$drama)
else if (names == TRUE)
index <- paste(ft$drama, ft$Speaker.figure_surface)
else
index <- paste(ft$drama, ft$Speaker.figure_id)
}

switch(by,
drama = {
if (byFigure == FALSE) { index <- paste(ft$drama) }
else if (names == TRUE) { index <- paste(ft$drama, ft$Speaker.figure_surface) }
else { index <- paste(ft$drama, ft$Speaker.figure_id) }
},
act = {
if (byFigure == FALSE) { index <- paste(ft$drama, ft$Number.Act) }
else if (names == TRUE) { index <- paste(ft$drama, ft$Number.Act, ft$Speaker.figure_surface) }
else { index <- paste(ft$drama, ft$Number.Act, ft$Speaker.figure_id) }
},
scene = {
if (byFigure == FALSE) { index <- paste(ft$drama, ft$Number.Act, ft$Number.Scene) }
else if (names == TRUE) { index <- paste(ft$drama, ft$Number.Act, ft$Number.Scene, ft$Speaker.figure_surface) }
else { index <- paste(ft$drama, ft$Number.Act, ft$Number.Scene, ft$Speaker.figure_id) }
},
stop("Please enter valid string-value for argument 'by' (default = 'drama', 'act' or 'scene').")
)

r <- do.call(rbind, tapply(paste(ft[[cols[1]]], ft[[cols[2]]][-1]), index, function(x){prop.table(table(x))}))
r[,order(colSums(r),decreasing=TRUE)]
}

}
2 changes: 1 addition & 1 deletion R/text.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ tfidf1 <- function(word) {
#' @export
#' @examples
#' data(rksp.0)
#' rksp.0.ftable <- frequencytable(rksp.0$mtext,byFigure=TRUE)
#' rksp.0.ftable <- frequencytable(rksp.0$mtext,byFigure=TRUE,normalize=TRUE)
#' rksp.0.tfidf <- tfidf(rksp.0.ftable)
#' @examples
#' mat <- matrix(c(0.10,0.2, 0,
Expand Down
8 changes: 5 additions & 3 deletions man/frequencytable.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/frequencytable2.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/tfidf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit e3a66f7

Please sign in to comment.