pass optional encodings to handle non-latin text (closes #10)

ropensci · May 31, 2016 · 71c3b82 · 71c3b82
1 parent f86cc5c
commit 71c3b82
Show file tree

Hide file tree

Showing 10 changed files with 51 additions and 14 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: tabulizer
 Type: Package
 Title: Bindings for Tabula PDF Table Extractor Library
-Version: 0.1.14
+Version: 0.1.15
 Date: 2016-05-31
 Authors@R: c(person("Thomas J.", "Leeper", role = c("aut", "cre"),
                     email = "[email protected]"),

diff --git a/NEWS b/NEWS
@@ -1,3 +1,7 @@
+# CHANGES TO tabulizer 0.1.15 #
+
+* Added better support for specifying character encoding. (#10)
+
 # CHANGES TO tabulizer 0.1.14 #
 
 * Added support for password-protected PDF files. (#11)

diff --git a/R/extract_tables.R b/R/extract_tables.R
@@ -8,6 +8,7 @@
 #' @param spreadsheet A logical indicating whether to use Tabula's spreadsheet extraction algorithm. If \code{NULL} (the default), an automated assessment is made about whether it is appropriate.
 #' @param method A character string naming the method used to coerce the Java response object (a Java ArrayList of Tabula Tables) to some output format. The default method, \dQuote{matrix}, returns a list of character matrices. See Details for other options.
 #' @param password Optionally, a character string containing a user password to access a secured PDF.
+#' @param encoding Optionally, a character string specifying an encoding for the text, to be passed to the assignment method of \code{\link[base]{Encoding}}.
 #' @param \dots These are additional arguments passed to the internal functions dispatched by \code{method}.
 #' @details This function mimics the behavior of the Tabula command line utility. It returns a list of R character matrices containing tables extracted from a file by default. This response behavior can be changed by using the following options. \code{method = "character"} returns a list of single-element character vectors, where each vector is a tab-delimited, line-separated string of concatenated table cells. \code{method = "data.frame"} attempts to coerce the structure returned by \code{method = "character"} into a list of data.frames and returns character strings where this fails. \code{method = "csv"} writes the tables to comma-separated (CSV) files using Tabula's CSVWriter method in the same directory as the original PDF. \code{method = "tsv"} does the same but with tab-separated (TSV) files using Tabula's TSVWriter and \code{method = "json"} does the same using Tabula's JSONWriter method. The previous three methods all return the path to the directory containing the extracted table files. \code{method = "asis"} returns the Java object reference, which can be useful for debugging or for writing a custom parser.
 #' \code{\link{extract_areas}} implements this functionality in an interactive mode allowing the user to specify extraction areas for each page.
@@ -49,6 +50,7 @@ function(file,
          spreadsheet = NULL,
          method = "matrix",
          password = NULL,
+         encoding = NULL,
          ...) {
 
     pdfDocument <- load_doc(file, password = password)
@@ -115,9 +117,9 @@ function(file,
            "csv" = write_csvs(tables, file = file, ...),
            "tsv" = write_tsvs(tables, file = file, ...),
            "json" = write_jsons(tables, file = file, ...),
-           "character" = list_characters(tables, ...),
-           "matrix" = list_matrices(tables, ...),
-           "data.frame" = list_data_frames(tables, ...),
+           "character" = list_characters(tables, encoding = encoding, ...),
+           "matrix" = list_matrices(tables, encoding = encoding, ...),
+           "data.frame" = list_data_frames(tables, encoding = encoding, ...),
            "asis" = tables,
            tables)
 }
diff --git a/R/extract_text.R b/R/extract_text.R
@@ -3,6 +3,7 @@
 #' @param file A character string specifying the path or URL to a PDF file.
 #' @param pages An optional integer vector specifying pages to extract from.
 #' @param password Optionally, a character string containing a user password to access a secured PDF.
+#' @param encoding Optionally, a character string specifying an encoding for the text, to be passed to the assignment method of \code{\link[base]{Encoding}}.
 #' @details This function converts the contents of a PDF file into a single unstructured character string.
 #' @return If \code{pages = NULL} (the default), a length 1 character vector, otherwise a vector of length \code{length(pages)}.
 #' @author Thomas J. Leeper <[email protected]>
@@ -20,7 +21,7 @@
 #' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{split_pdf}}
 #' @importFrom rJava J new
 #' @export
-extract_text <- function(file, pages = NULL, password = NULL) {
+extract_text <- function(file, pages = NULL, password = NULL, encoding = NULL) {
     pdfDocument <- load_doc(file, password = password)
     on.exit(pdfDocument$close())
 
@@ -36,5 +37,8 @@ extract_text <- function(file, pages = NULL, password = NULL) {
     } else {
         out <- stripper$getText(pdfDocument)
     }
+    if (!is.null(encoding)) {
+        Encoding(out) <- encoding
+    }
     out
 }
diff --git a/R/output.R b/R/output.R
@@ -49,7 +49,7 @@ write_jsons <- function(tables, file, ...) {
     normalizePath(dirname(file))
 }
 
-list_matrices <- function(tables, ...) {
+list_matrices <- function(tables, encoding = NULL, ...) {
     out <- list()
     n <- 1L
     tablesIterator <- tables$iterator()
@@ -63,21 +63,24 @@ list_matrices <- function(tables, ...) {
                 out[[n]][i, j] <- tab$getCell(i-1L, j-1L)$getText()
             }
         }
+        if (!is.null(encoding)) {
+            Encoding(out[[n]]) <- encoding
+        }
         rm(tab)
         n <- n + 1L
     }
     out
 }
 
-list_characters <- function(tables, sep = "\t", ...) {
-    m <- list_matrices(tables, ...)
+list_characters <- function(tables, sep = "\t", encoding = NULL, ...) {
+    m <- list_matrices(tables, encoding = encoding, ...)
     lapply(m, function(x) {
         paste0(apply(x, 1, paste, collapse = sep), collapse = "\n")
     })
 }
 
-list_data_frames <- function(tables, sep = "\t", stringsAsFactors = FALSE, ...) {
-    char <- list_characters(tables = tables, sep = sep)
+list_data_frames <- function(tables, sep = "\t", stringsAsFactors = FALSE, encoding = NULL, ...) {
+    char <- list_characters(tables = tables, sep = sep, encoding = encoding)
     lapply(char, function(x) {
         o <- try(read.delim(text = x, stringsAsFactors = stringsAsFactors, ...))
         if (inherits(o, "try-error")) {

diff --git a/R/utils.R b/R/utils.R
@@ -1,8 +1,8 @@
-localize_file <- function(path, copy = FALSE) {
+localize_file <- function(path, copy = FALSE, quiet = TRUE) {
     if (grepl("^http.*://", path)) {
         if (copy) {
             tmp <- tempfile(fileext = ".pdf")
-            download.file(path, tmp, mode = "wb")
+            download.file(path, tmp, quiet = quiet, mode = "wb")
             path <- tmp
         } else {
             path <- new(J("java.net.URL"), path)

diff --git a/man/extract_tables.Rd b/man/extract_tables.Rd
diff --git a/man/extract_text.Rd b/man/extract_text.Rd
diff --git a/tests/testthat.R b/tests/testthat.R
@@ -1,4 +1,5 @@
 library("testthat")
 library("tabulizer")
 
+stop_logging()
 test_check("tabulizer")
diff --git a/tests/testthat/test_non-latin.R b/tests/testthat/test_non-latin.R
@@ -0,0 +1,19 @@
+context("Non-latin character tests")
+
+test_that("Read Spanish language PDF", {
+    f1 <- "https://github.com/tabulapdf/tabula-java/raw/98957221950af4b90620b51a29e0bbe502eea9ad/src/test/resources/technology/tabula/argentina_diputados_voting_record.pdf"
+    expect_true(is.matrix(extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)))[[1]]))  # no method given: the first extracted table should be a matrix
+    t1a <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), method = "data.frame", encoding = "latin1")  # NOTE(review): only checks that the call runs; t1a is never asserted on
+    #expect_true(names(t1a[[1]])[2] == "Frente.CÃ.vico.por.Santiago", label = "latin1 encoding worked")
+    t1b <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), method = "data.frame", encoding = "UTF-8")  # NOTE(review): only checks that the call runs; t1b is never asserted on
+    #expect_true(names(t1b[[1]])[2] == "Frente.Cívico.por.Santiago", label = "UTF-8 encoding worked")
+
+})
+
+test_that("Read French language PDF w/correct encoding", {
+    f2 <- "http://journal-sfds.fr/index.php/J-SFdS/article/download/514/486"
+    t2a <- extract_text(f2, pages = 1, encoding = "latin1")[[1]]  # full argument name `pages`: do not rely on partial matching of `page`
+    t2b <- extract_text(f2, pages = 1, encoding = "UTF-8")[[1]]
+    expect_true(nchar(strsplit(t2a, "\n")[[1]][1]) == 50, label = "latin1 encoding worked")  # character count of the first text line when marked as latin1
+    expect_true(nchar(strsplit(t2b, "\n")[[1]][1]) == 47, label = "UTF-8 encoding worked")  # same line when marked as UTF-8
+})