Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend frequencytable() function #54

Merged
merged 9 commits into from
Nov 2, 2017
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
## Git branching scheme

We follow the scheme described (here)[http://nvie.com/posts/a-successful-git-branching-model/], which can be summarised like this:
We follow the scheme described [here](http://nvie.com/posts/a-successful-git-branching-model/), which can be summarised like this:

- `master`: only stable, released code
- `develop`: latest updates, new features integrated here
Expand Down
70 changes: 48 additions & 22 deletions R/frequencytable.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#' This function generates a matrix of word frequencies by figure or drama.
#' This function generates a matrix of word frequencies by drama, act or scene and/or by figure.
#' @param t The text table, potentially covering multiple texts
#' @param acceptedPOS A list of accepted pos tags
#' @param names Whether to use figure names or ids
#' @param byFigure Wether the count is by figure or by text
#' @param by Whether the count is by drama (default), act or scene
#' @param column The column name we should use (should be either Token.surface or Token.lemma)
#' @param sep The separation character that goes between drama name and figure (if applicable)
#' @param normalize Whether to normalize values or not
Expand All @@ -20,23 +21,31 @@
#' stylo(gui=F, frequencies = stylo_table)
#' }
#' @export
frequencytable <- function(t, acceptedPOS = postags$de$words, names=FALSE, column="Token.surface", byFigure=FALSE, sep="|", normalize=TRUE, sortResult=FALSE) {
frequencytable <- function(t, acceptedPOS = postags$de$words, names=FALSE, column="Token.surface", byFigure=FALSE, sep="|", normalize=FALSE, sortResult=FALSE, by="drama") {
ft <- t
if (length(acceptedPOS) > 0)
ft <- t[t$Token.pos %in% acceptedPOS,]

switch(by,
drama = {
if (byFigure == FALSE) { xt <- stats::xtabs(~drama + ft[,get(column)], data=ft) }
else if (names == TRUE) { xt <- stats::xtabs(~ paste(drama,Speaker.figure_surface,sep=sep) + ~ft[,get(column)], data=ft) }
else { xt <- stats::xtabs(~ paste(drama,Speaker.figure_id,sep=sep) + ~ft[,get(column)], data=ft) }
},
act = {
if (byFigure == FALSE) { xt <- stats::xtabs(~ paste(drama,Number.Act,sep=sep) + ~ft[,get(column)], data=ft) }
else if (names == TRUE) { xt <- stats::xtabs(~ paste(drama,Number.Act,Speaker.figure_surface,sep=sep) + ~ft[,get(column)], data=ft) }
else { xt <- stats::xtabs(~ paste(drama,Number.Act,Speaker.figure_id,sep=sep) + ~ft[,get(column)], data=ft) }
},
scene = {
if (byFigure == FALSE) { xt <- stats::xtabs(~ paste(drama,Number.Act,Number.Scene,sep=sep) + ~ft[,get(column)], data=ft) }
else if (names == TRUE) { xt <- stats::xtabs(~ paste(drama,Number.Act,Number.Scene,Speaker.figure_surface,sep=sep) + ~ft[,get(column)], data=ft) }
else { xt <- stats::xtabs(~ paste(drama,Number.Act,Number.Scene,Speaker.figure_id,sep=sep) + ~ft[,get(column)], data=ft) }
},
stop("Please enter valid string-value for argument 'by' (default = 'drama', 'act' or 'scene').")
)

if (byFigure == FALSE) {
xt <- stats::xtabs(~drama + ft[,get(column)], data=ft)
r <- as.matrix(stats::ftable(xt, row.vars = c(), col.vars = c()))
} else if (names == TRUE) {
xt <- stats::xtabs(~ paste(drama,Speaker.figure_surface,sep=sep) + ~ft[,get(column)], data=ft)
r <- as.matrix(stats::ftable(xt, row.vars = c(), col.vars = c()))
} else {
xt <- stats::xtabs(~paste(drama,Speaker.figure_id,sep=sep)+ft[,get(column)], data=ft)
r <- as.matrix(stats::ftable(xt, row.vars = c(), col.vars = c()))
}

r <- as.matrix(stats::ftable(xt, row.vars = c(), col.vars = c()))

if (normalize == TRUE) {
r <- t(apply(r,1,function(x) { x / sum(x)}))
Expand All @@ -55,18 +64,35 @@ frequencytable <- function(t, acceptedPOS = postags$de$words, names=FALSE, colum
#' @param acceptedPOS A list of accepted pos tags
#' @param names Whether to use figure names or ids
#' @param byFigure Wether the count is by figure or by text
#' @param by Whether the count is by drama (default), act or scene
#' @param cols The column names we should use (should be either Token.surface or Token.lemma)
#' @keywords internal
frequencytable2 <- function(t, acceptedPOS = postags$de$words, names=FALSE, cols=c("Token.surface", "Token.surface"), byFigure=FALSE) {
frequencytable2 <- function(t, acceptedPOS = postags$de$words, names=FALSE, cols=c("Token.surface", "Token.surface"), byFigure=FALSE, by="drama") {
ft <- t
if (length(acceptedPOS) > 0)
if (length(acceptedPOS) > 0) {
ft <- t[t$Token.pos %in% acceptedPOS,]
if (byFigure == FALSE)
index <- paste(ft$drama)
else if (names == TRUE)
index <- paste(ft$drama, ft$Speaker.figure_surface)
else
index <- paste(ft$drama, ft$Speaker.figure_id)
}

switch(by,
drama = {
if (byFigure == FALSE) { index <- paste(ft$drama) }
else if (names == TRUE) { index <- paste(ft$drama, ft$Speaker.figure_surface) }
else { index <- paste(ft$drama, ft$Speaker.figure_id) }
},
act = {
if (byFigure == FALSE) { index <- paste(ft$drama, ft$Number.Act) }
else if (names == TRUE) { index <- paste(ft$drama, ft$Number.Act, ft$Speaker.figure_surface) }
else { index <- paste(ft$drama, ft$Number.Act, ft$Speaker.figure_id) }
},
scene = {
if (byFigure == FALSE) { index <- paste(ft$drama, ft$Number.Act, ft$Number.Scene) }
else if (names == TRUE) { index <- paste(ft$drama, ft$Number.Act, ft$Number.Scene, ft$Speaker.figure_surface) }
else { index <- paste(ft$drama, ft$Number.Act, ft$Number.Scene, ft$Speaker.figure_id) }
},
stop("Please enter valid string-value for argument 'by' (default = 'drama', 'act' or 'scene').")
)

r <- do.call(rbind, tapply(paste(ft[[cols[1]]], ft[[cols[2]]][-1]), index, function(x){prop.table(table(x))}))
r[,order(colSums(r),decreasing=TRUE)]
}

}
2 changes: 1 addition & 1 deletion R/text.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ tfidf1 <- function(word) {
#' @export
#' @examples
#' data(rksp.0)
#' rksp.0.ftable <- frequencytable(rksp.0$mtext,byFigure=TRUE)
#' rksp.0.ftable <- frequencytable(rksp.0$mtext,byFigure=TRUE,normalize=TRUE)
#' rksp.0.tfidf <- tfidf(rksp.0.ftable)
#' @examples
#' mat <- matrix(c(0.10,0.2, 0,
Expand Down
8 changes: 5 additions & 3 deletions man/frequencytable.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/frequencytable2.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/tfidf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.