Merge branch 'develop' into features/frequencytable-41
* develop: (61 commits)
  documentation change following 62e9ca5
  dataDirectory now has a default option
  added additional data sources
  fix #53
  better to use "stringsAsFactors" than as.is
  explicitly converting date to string
  no more automatic conversion to factor #53
  added FigureMention type
  ignoring CONTRIBUTING.md
  preventing notes
  added missing parameter
  fix #48
  updated documentation
  Updated function to use collectionDirectory option #44
  new exporter interface
  Updated report to some internal changes
  fix output location of report #34
  Change after adding collection id column
  fix #34, moved report to inst/rmd
  added GDC as a download option
  ...
Nils Reiter committed Oct 21, 2017
2 parents e01d58c + fad6001, commit 63713af
Showing 31 changed files with 655 additions and 83 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -6,3 +6,4 @@
^Makefile$
^java/\.classpath$
^data-raw$
^CONTRIBUTING.md$
10 changes: 8 additions & 2 deletions NAMESPACE
@@ -7,11 +7,15 @@ export(countAnnotations)
export(dictionaryStatistics)
export(dictionaryStatisticsL)
export(dictionaryStatisticsSingle)
export(dramaHead)
export(dramaTail)
export(figureStatistics)
export(figurematrix)
export(frequencytable)
export(hamming)
export(installCollectionData)
export(installData)
export(keyness)
export(limitFigures)
export(loadAllInstalledIds)
export(loadAnnotations)
@@ -25,10 +29,12 @@ export(loadText)
export(plotSpiderWebs)
export(plotUtterancePositions)
export(postags)
export(presence)
export(qd.colors)
export(rankFiguresByAppearance)
export(rankFiguresByDramatisPersonae)
export(report)
export(scenicDifference)
export(setup)
export(tfidf)
export(utteranceStatistics)
@@ -43,19 +49,19 @@ importFrom(graphics,stripchart)
importFrom(httr,HEAD)
importFrom(httr,headers)
importFrom(rJava,.jarray)
importFrom(rJava,.jevalArray)
importFrom(rJava,.jnew)
importFrom(rJava,.jnull)
importFrom(rJava,.jsimplify)
importFrom(readr,locale)
importFrom(readr,read_csv)
importFrom(reshape2,dcast)
importFrom(rmarkdown,render)
importFrom(stats,aggregate)
importFrom(stats,ave)
importFrom(stats,chisq.test)
importFrom(stats,cor)
importFrom(stats,ftable)
importFrom(stats,na.omit)
importFrom(stats,pchisq)
importFrom(stats,reshape)
importFrom(stats,sd)
importFrom(stats,xtabs)
4 changes: 3 additions & 1 deletion R/configuration.R
@@ -55,7 +55,9 @@ configuration.scene <- function(text) {
words.per.segment <- t[,.N,.(corpus,drama,Speaker.figure_surface, begin.Scene)]
cfg <- stats::reshape(words.per.segment, direction="wide", idvar = c("corpus","drama","Speaker.figure_surface"), timevar = "begin.Scene")
cfg[is.na(cfg)] <- 0
colnames(cfg)[4:ncol(cfg)] <- seq(1,(ncol(cfg)-3))
cfg <- cfg[order(as.character(cfg$Speaker.figure_surface)),]
if (length(4:ncol(cfg)) > 0)
colnames(cfg)[4:ncol(cfg)] <- seq(1,length(4:ncol(cfg)))
list(matrix=as.matrix(cfg[,4:ncol(cfg)]),drama=cfg[,1:2],figure=as.character(cfg[[3]]))
}
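
For context, a minimal standalone sketch of what the guarded renaming does, using an invented configuration table (metadata in columns 1-3, per-scene word counts from column 4 on; in the package, cfg comes from the stats::reshape() call above):

# invented example data, not taken from the package
cfg <- data.frame(corpus = "test", drama = "d1",
                  Speaker.figure_surface = c("A", "B"),
                  X1 = c(10, 0), X7 = c(3, 7))
if (length(4:ncol(cfg)) > 0)
  colnames(cfg)[4:ncol(cfg)] <- seq(1, length(4:ncol(cfg)))
colnames(cfg)
# [1] "corpus" "drama" "Speaker.figure_surface" "1" "2"
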

6 changes: 4 additions & 2 deletions R/dictionaryStatistics.R
@@ -2,13 +2,15 @@
#' @description This function loads the word fields that are available on the web.
#' @param fieldnames A list of names for the dictionaries. It is expected that files with these names can be found below the given URL.
#' @param baseurl The base path delivering the dictionaries. Should end in a /; field names and the suffix will be appended and fed into read.csv().
#' @param fileSuffix The suffix for the dictionary files
#' @importFrom utils read.csv
#' @export
loadFields <- function(fieldnames=c(),
baseurl="https://raw.githubusercontent.com/quadrama/metadata/master/fields/") {
baseurl="https://raw.githubusercontent.com/quadrama/metadata/master/fields/",
fileSuffix=".txt") {
r <- list()
for (field in fieldnames) {
url <- paste(baseurl, field, ".txt", sep="")
url <- paste(baseurl, field, fileSuffix, sep="")
r[[field]] <- as.character((read.csv(url, header=F, fileEncoding = "UTF-8"))$V1)
}
r
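
A hedged usage sketch for the extended signature, assuming the package is loaded and a field file with the given name exists under the default baseurl (the field name "Liebe" is an assumption for illustration):

# load one word field, spelling out the new fileSuffix argument explicitly
fields <- loadFields(fieldnames = c("Liebe"), fileSuffix = ".txt")
str(fields)  # a named list with one character vector of words per field
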
2 changes: 1 addition & 1 deletion R/installData.R
@@ -31,7 +31,7 @@ installCollectionData <- function(dataDirectory=getOption("qd.datadir"),

# move new dir into place
file.rename(from=file.path(repoDirectory,"collections"),
to=file.path(dataDirectory,"collections"))
to=getOption("qd.collectionDirectory"))

# remove repo directory
unlink(repoDirectory,recursive = TRUE)
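
With this change the destination is taken from the qd.collectionDirectory option rather than derived from dataDirectory. A minimal sketch of overriding that option before installing (the path is invented; the remaining arguments are assumed to keep their defaults):

# point collection installation at a custom directory (illustrative path)
options(qd.collectionDirectory = file.path("~", "qd-data", "collections"))
installCollectionData()
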
107 changes: 107 additions & 0 deletions R/keyness.R
@@ -0,0 +1,107 @@
#' @title Keywords
#' @description Given a frequency table (with texts as rows and words as columns),
#' this function calculates the log-likelihood or the log ratio of one set of rows against the remaining rows.
#' The return value is a list containing a score for each word.
#' @param ft The frequency table
#' @param row The row number we want to compare to the others; can be a vector of row numbers
#' @param epsilon Zero values are replaced by this value, in order to avoid division by zero
#' @param siglevel Return only keywords with a p-value below this significance level. Set to 1 to get all words
#' @param method Either "logratio" or "loglikelihood" (default)
#' @param minimalFrequency Words less frequent than this value are not considered at all
#' @return A list of keywords, sorted by their log-likelihood value, calculated according to http://ucrel.lancs.ac.uk/llwizard.html
#' @export
#' @importFrom stats pchisq chisq.test
#' @examples
#' data("rksp.0")
#' ft <- frequencytable(rksp.0$mtext,byFigure = TRUE,names=TRUE,normalize = FALSE)
#' # Calculate log ratio for all words
#' keywords <- keyness(ft, method="logratio", row=7, minimalFrequency = 5)
#' # Remove words that are not significantly different
#' keywords <- keywords[names(keywords) %in% names(keyness(ft, row=1,siglevel=0.01))]
#'
keyness <- function(ft, row=1, epsilon=1e-100,siglevel=0.05,method=c("loglikelihood","logratio"),minimalFrequency=10) {
f1 <- colSums(matrix(ft[row,],nrow=length(row),dimnames=list(NULL,colnames(ft))))
f2 <- colSums(matrix(ft[-1*row,],nrow=nrow(ft)-length(row),dimnames=list(NULL,colnames(ft))))

type <- match.arg(method)
switch(type,
loglikelihood=keyness.ll(f1, f2, minimalFrequency, epsilon, siglevel),
logratio=keyness.logratio(f1,f2,minimalFrequency))

}

keyness.ll <- function(f1, f2, minimalFrequency, epsilon,siglevel) {

total1 <- sum(f1)
total2 <- sum(f2)

f1[f1==0] <- epsilon
f2[f2==0] <- epsilon
other1 <- total1 - f1
other2 <- total2 - f2
#print(paste("total1",total1))
#print(paste("total2",total2))

rf <- (f1 + f2) / ( total1 + total2 )

e1 <- total1 * rf
e2 <- total2 * rf

l <- 2 * ( ( f1 * log(f1/e1) ) + (f2 * log(f2/e2) ))


l <- sort(l,decreasing = TRUE)
pvalues <- 1-stats::pchisq(l, df=1)

l[pvalues<siglevel]
}
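
As a sanity check on the formula above, a worked example with invented counts: a word occurring 10 times in a 1,000-word target text and 2 times in a 1,200-word comparison text.

f1 <- 10; total1 <- 1000   # target text: word count and total size (invented)
f2 <- 2;  total2 <- 1200   # comparison text: word count and total size (invented)
rf <- (f1 + f2) / (total1 + total2)   # pooled relative frequency
e1 <- total1 * rf                     # expected counts under the null
e2 <- total2 * rf
ll <- 2 * (f1 * log(f1 / e1) + f2 * log(f2 / e2))
ll                      # about 7.38
1 - pchisq(ll, df = 1)  # about 0.0066, so this word passes siglevel = 0.05
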

keyness.logratio <- function(f1, f2, minimalFrequency) {
f1[f1<=minimalFrequency] <- 0
f2[f2<=minimalFrequency] <- 0

rf1 <- f1 / sum(f1)
rf2 <- f2 / sum(f2)

r <- sort(log2(rf1/rf2),decreasing = TRUE)

r[is.finite(r)]
}

# R function for calculating LL, by Andrew Hardie, Lancaster University.
# (with apologies for the inevitable R-n00b blunders)

loglikelihood.test = function(O)
{
DNAME <- deparse(substitute(O))

E = suppressWarnings(chisq.test(O)$expected)

sum = 0;

for(i in 1:length(O[,1]))
{
for(j in 1:length(O[1,]))
{
if (O[i,j] == 0 || E[i,j] == 0)
next
sum = sum + (O[i,j] * log(O[i,j]/E[i,j]))
}
}
STAT = sum * 2;

DF = (length(O[1,]) - 1) * (length(O[,1]) - 1)

P = 1 - pchisq(STAT, df=DF)

names(DF) = "df"
names(STAT) = "Log-likelihood"

obj = list(statistic=STAT, parameter=DF, p.value=P, method="Log-Likelihood test",
data.name=DNAME, observed=O, expected=E)

attr(obj, "class") <- "htest"

return (obj)
}
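
The helper above follows the stats::chisq.test / htest conventions; a short usage sketch on an invented 2x2 contingency table (target word vs. all other words in two texts). Note that loglikelihood.test is not exported in the NAMESPACE shown earlier, so outside the package it would have to be reached internally:

# invented 2x2 table: rows = target word / other words, columns = two texts
O <- matrix(c(10, 990, 2, 1198), nrow = 2,
            dimnames = list(c("word", "other"), c("text1", "text2")))
loglikelihood.test(O)  # returns an htest object with the LL statistic and p-value
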
