Merge branch 'develop' into features/frequencytable-41

* develop: (61 commits) documentation change following 62e9ca5 dataDirectory now has a default option added additional data sources fix #53 better to use "stringsAsFactors" than as.is explicitly converting date to string no more automatic conversion to factor #53 added FigureMention type ignoring CONTRIBUTING.md preventing notes added missing parameter fix #48 updated documentation Updated function to use collectionDirectory option #44 new exporter interface Updated report to some internal changes fix output location of report #34 Change after adding collection id column fix #34, moved report to inst/rmd added GDC as a download option ...
quadrama · Oct 21, 2017 · 63713af · 63713af
2 parents e01d58c + fad6001
commit 63713af
Show file tree

Hide file tree

Showing 31 changed files with 655 additions and 83 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -6,3 +6,4 @@
 ^Makefile$
 ^java/\.classpath$
 ^data-raw$
+^CONTRIBUTING.md$
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,11 +7,15 @@ export(countAnnotations)
 export(dictionaryStatistics)
 export(dictionaryStatisticsL)
 export(dictionaryStatisticsSingle)
+export(dramaHead)
+export(dramaTail)
 export(figureStatistics)
 export(figurematrix)
 export(frequencytable)
+export(hamming)
 export(installCollectionData)
 export(installData)
+export(keyness)
 export(limitFigures)
 export(loadAllInstalledIds)
 export(loadAnnotations)
@@ -25,10 +29,12 @@ export(loadText)
 export(plotSpiderWebs)
 export(plotUtterancePositions)
 export(postags)
+export(presence)
 export(qd.colors)
 export(rankFiguresByAppearance)
 export(rankFiguresByDramatisPersonae)
 export(report)
+export(scenicDifference)
 export(setup)
 export(tfidf)
 export(utteranceStatistics)
@@ -43,19 +49,19 @@ importFrom(graphics,stripchart)
 importFrom(httr,HEAD)
 importFrom(httr,headers)
 importFrom(rJava,.jarray)
-importFrom(rJava,.jevalArray)
 importFrom(rJava,.jnew)
 importFrom(rJava,.jnull)
-importFrom(rJava,.jsimplify)
 importFrom(readr,locale)
 importFrom(readr,read_csv)
 importFrom(reshape2,dcast)
 importFrom(rmarkdown,render)
 importFrom(stats,aggregate)
 importFrom(stats,ave)
+importFrom(stats,chisq.test)
 importFrom(stats,cor)
 importFrom(stats,ftable)
 importFrom(stats,na.omit)
+importFrom(stats,pchisq)
 importFrom(stats,reshape)
 importFrom(stats,sd)
 importFrom(stats,xtabs)

diff --git a/R/configuration.R b/R/configuration.R
@@ -55,7 +55,9 @@ configuration.scene <- function(text) {
   words.per.segment <- t[,.N,.(corpus,drama,Speaker.figure_surface, begin.Scene)]
   cfg <- stats::reshape(words.per.segment, direction="wide", idvar = c("corpus","drama","Speaker.figure_surface"), timevar = "begin.Scene")
   cfg[is.na(cfg)] <- 0
-  colnames(cfg)[4:ncol(cfg)] <- seq(1,(ncol(cfg)-3))
+  cfg <- cfg[order(as.character(cfg$Speaker.figure_surface)),]
+  if (length(4:ncol(cfg)) > 0)
+    colnames(cfg)[4:ncol(cfg)] <- seq(1,length(4:ncol(cfg)))
   list(matrix=as.matrix(cfg[,4:ncol(cfg)]),drama=cfg[,1:2],figure=as.character(cfg[[3]]))
 }
 

diff --git a/R/dictionaryStatistics.R b/R/dictionaryStatistics.R
@@ -2,13 +2,15 @@
 #' @description This function loads the word fields that are available on the web.
 #' @param fieldnames A list of names for the dictionaries. It is expected that files with that name can be found below the URL.
 #' @param baseurl The base path delivering the dictionaries. Should end in a /, field names will be appended and fed into read.csv().
+#' @param fileSuffix The suffix for the dictionary files
 #' @importFrom utils read.csv
 #' @export
 loadFields <- function(fieldnames=c(),
-                      baseurl="https://raw.githubusercontent.com/quadrama/metadata/master/fields/") {
+                      baseurl="https://raw.githubusercontent.com/quadrama/metadata/master/fields/",
+                      fileSuffix=".txt") {
   r <- list()
   for (field in fieldnames) {
-    url <- paste(baseurl, field, ".txt", sep="")
+    url <- paste(baseurl, field, fileSuffix, sep="")
     r[[field]] <- as.character((read.csv(url, header=F, fileEncoding = "UTF-8"))$V1)
   }
   r

diff --git a/R/installData.R b/R/installData.R
@@ -31,7 +31,7 @@ installCollectionData <- function(dataDirectory=getOption("qd.datadir"),
 
   # move new dir into place
   file.rename(from=file.path(repoDirectory,"collections"),
-              to=file.path(dataDirectory,"collections"))
+              to=getOption("qd.collectionDirectory"))
 
   # remove repo directory
   unlink(repoDirectory,recursive = TRUE)

diff --git a/R/keyness.R b/R/keyness.R
@@ -0,0 +1,107 @@
+#' @title Keywords
+#' @description  Given a frequency table (with texts as rows and words as columns),
+#' this function calculates log-likelihood and log ratio of one set of rows against the other rows. 
+#' The return value is a list containing scores for each word
+#' @param ft The frequency table
+#' @param row The row number we want to compare to the others, can be a vector of row numbers
+#' @param epsilon null values are replaced by this value, in order to avoid division by zero
+#' @param siglevel Return only the keywords above the significance level. Set to 1 to get all words
+#' @param method Either "logratio" or "loglikelihood" (default)
+#' @param minimalFrequency Words less frequent than this value are not considered at all
+#' @return A list of keywords, sorted by their log-likelihood value, calculated according to http://ucrel.lancs.ac.uk/llwizard.html
+#' @export
+#' @importFrom stats pchisq chisq.test
+#' @examples 
+#' data("rksp.0")
+#' ft <- frequencytable(rksp.0$mtext,byFigure = TRUE,names=TRUE,normalize = FALSE)
+#' # Calculate log ratio for all words
+#' keywords <- keyness(ft, method="logratio", row=7, minimalFrequency = 5)
+#' # Remove words that are not significantly different
+#' keywords <- keywords[names(keywords) %in% names(keyness(ft, row=1,siglevel=0.01))]
+#' 
+keyness <- function(ft, row=1, epsilon=1e-100,siglevel=0.05,method=c("loglikelihood","logratio"),minimalFrequency=10) {
+  f1 <- colSums(matrix(ft[row,],nrow=length(row),dimnames=list(NULL,colnames(ft))))
+  f2 <- colSums(matrix(ft[-1*row,],nrow=nrow(ft)-length(row),dimnames=list(NULL,colnames(ft))))
+
+  type <- match.arg(method)
+  switch(type,
+         loglikelihood=keyness.ll(f1, f2, minimalFrequency, epsilon, siglevel),
+         logratio=keyness.logratio(f1,f2,minimalFrequency))
+
+}
+
+keyness.ll <- function(f1, f2, minimalFrequency, epsilon,siglevel) {
+
+  total1 <- sum(f1)
+  total2 <- sum(f2)
+
+  f1[f1==0] <- epsilon
+  f2[f2==0] <- epsilon
+  other1 <- total1 - f1
+  other2 <- total2 - f2
+  #print(paste("total1",total1))
+  #print(paste("total2",total2))
+
+  rf <- (f1 + f2) / ( total1 + total2 )
+
+  e1 <- total1 * rf
+  e2 <- total2 * rf
+
+  l <- 2 * ( ( f1 * log(f1/e1)  ) + (f2 * log(f2/e2)  ))
+
+
+  l <- sort(l,decreasing = TRUE)
+  pvalues <- 1-stats::pchisq(l, df=1)
+
+  l[pvalues<siglevel]
+}
+
+keyness.logratio <- function(f1, f2, minimalFrequency) {
+  f1[f1<=minimalFrequency] <- 0
+  f2[f2<=minimalFrequency] <- 0
+
+  rf1 <- f1 / sum(f1)
+  rf2 <- f2 / sum(f2)
+
+  r <- sort(log2(rf1/rf2),decreasing = TRUE)
+
+  r[is.finite(r)]
+}
+
+# R function for calculating LL, by Andrew Hardie, Lancaster University.
+# (with apologies for the inevitable R-n00b blunders)
+
+loglikelihood.test = function(O)
+{
+  DNAME <- deparse(substitute(O))
+
+  E = suppressWarnings(chisq.test(O)$expected)
+
+  sum = 0;
+
+  for(i in 1:length(O[,1]))
+  {
+    for(j in 1:length(O[1,]))
+    {
+      if (O[i,j] == 0 || E[i,j] == 0)
+        next
+      sum = sum + (O[i,j] * log(O[i,j]/E[i,j]))
+    }
+  }
+  STAT = sum * 2;
+
+  DF = (length(O[1,]) - 1) * (length(O[,1]) - 1)
+
+  P = 1 - pchisq(STAT, df=DF)
+
+  names(DF) = "df"
+  names(STAT) = "Log-likelihood"
+
+  obj =  list(statistic=STAT, parameter=DF, p.value=P, method="Log-Likelihood test", 
+              data.name=DNAME, observed=O, expected=E)
+
+  attr(obj, "class") <- "htest"
+
+  return (obj)
+}
+