Skip to content

Commit

Permalink
pass optional encodings to handle non-latin text (closes #10)
Browse files Browse the repository at this point in the history
  • Loading branch information
leeper committed May 31, 2016
1 parent f86cc5c commit 71c3b82
Show file tree
Hide file tree
Showing 10 changed files with 51 additions and 14 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: tabulizer
Type: Package
Title: Bindings for Tabula PDF Table Extractor Library
Version: 0.1.14
Version: 0.1.15
Date: 2016-05-31
Authors@R: c(person("Thomas J.", "Leeper", role = c("aut", "cre"),
email = "[email protected]"),
Expand Down
4 changes: 4 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# CHANGES TO tabulizer 0.1.15 #

* Added better support for specifying character encoding. (#10)

# CHANGES TO tabulizer 0.1.14 #

* Added support for password-protected PDF files. (#11)
Expand Down
8 changes: 5 additions & 3 deletions R/extract_tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#' @param spreadsheet A logical indicating whether to use Tabula's spreadsheet extraction algorithm. If \code{NULL} (the default), an automated assessment is made about whether it is appropriate.
#' @param method A character string naming how to coerce the Java response object (a Java ArrayList of Tabula Tables) to some output format. The default, \dQuote{matrix}, returns a list of character matrices. See Details for other options.
#' @param password Optionally, a character string containing a user password to access a secured PDF.
#' @param encoding Optionally, a character string specifying an encoding for the text, to be passed to the assignment method of \code{\link[base]{Encoding}}.
#' @param \dots These are additional arguments passed to the internal functions dispatched by \code{method}.
#' @details This function mimics the behavior of the Tabula command line utility. It returns a list of R character matrices containing tables extracted from a file by default. This response behavior can be changed by using the following options. \code{method = "character"} returns a list of single-element character vectors, where each vector is a tab-delimited, line-separated string of concatenated table cells. \code{method = "data.frame"} attempts to coerce the structure returned by \code{method = "character"} into a list of data.frames and returns character strings where this fails. \code{method = "csv"} writes the tables to comma-separated (CSV) files using Tabula's CSVWriter method in the same directory as the original PDF. \code{method = "tsv"} does the same but with tab-separated (TSV) files using Tabula's TSVWriter and \code{method = "json"} does the same using Tabula's JSONWriter method. The previous three methods all return the path to the directory containing the extracted table files. \code{method = "asis"} returns the Java object reference, which can be useful for debugging or for writing a custom parser.
#' \code{\link{extract_areas}} implements this functionality in an interactive mode allowing the user to specify extraction areas for each page.
Expand Down Expand Up @@ -49,6 +50,7 @@ function(file,
spreadsheet = NULL,
method = "matrix",
password = NULL,
encoding = NULL,
...) {

pdfDocument <- load_doc(file, password = password)
Expand Down Expand Up @@ -115,9 +117,9 @@ function(file,
"csv" = write_csvs(tables, file = file, ...),
"tsv" = write_tsvs(tables, file = file, ...),
"json" = write_jsons(tables, file = file, ...),
"character" = list_characters(tables, ...),
"matrix" = list_matrices(tables, ...),
"data.frame" = list_data_frames(tables, ...),
"character" = list_characters(tables, encoding = encoding, ...),
"matrix" = list_matrices(tables, encoding = encoding, ...),
"data.frame" = list_data_frames(tables, encoding = encoding, ...),
"asis" = tables,
tables)
}
6 changes: 5 additions & 1 deletion R/extract_text.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#' @param file A character string specifying the path or URL to a PDF file.
#' @param pages An optional integer vector specifying pages to extract from.
#' @param password Optionally, a character string containing a user password to access a secured PDF.
#' @param encoding Optionally, a character string specifying an encoding for the text, to be passed to the assignment method of \code{\link[base]{Encoding}}.
#' @details This function converts the contents of a PDF file into a single unstructured character string.
#' @return If \code{pages = NULL} (the default), a length 1 character vector, otherwise a vector of length \code{length(pages)}.
#' @author Thomas J. Leeper <[email protected]>
Expand All @@ -20,7 +21,7 @@
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{split_pdf}}
#' @importFrom rJava J new
#' @export
extract_text <- function(file, pages = NULL, password = NULL) {
extract_text <- function(file, pages = NULL, password = NULL, encoding = NULL) {
pdfDocument <- load_doc(file, password = password)
on.exit(pdfDocument$close())

Expand All @@ -36,5 +37,8 @@ extract_text <- function(file, pages = NULL, password = NULL) {
} else {
out <- stripper$getText(pdfDocument)
}
if (!is.null(encoding)) {
Encoding(out) <- encoding
}
out
}
13 changes: 8 additions & 5 deletions R/output.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ write_jsons <- function(tables, file, ...) {
normalizePath(dirname(file))
}

list_matrices <- function(tables, ...) {
list_matrices <- function(tables, encoding = NULL, ...) {
out <- list()
n <- 1L
tablesIterator <- tables$iterator()
Expand All @@ -63,21 +63,24 @@ list_matrices <- function(tables, ...) {
out[[n]][i, j] <- tab$getCell(i-1L, j-1L)$getText()
}
}
if (!is.null(encoding)) {
Encoding(out[[n]]) <- encoding
}
rm(tab)
n <- n + 1L
}
out
}

list_characters <- function(tables, sep = "\t", ...) {
m <- list_matrices(tables, ...)
list_characters <- function(tables, sep = "\t", encoding = NULL, ...) {
m <- list_matrices(tables, encoding = encoding, ...)
lapply(m, function(x) {
paste0(apply(x, 1, paste, collapse = sep), collapse = "\n")
})
}

list_data_frames <- function(tables, sep = "\t", stringsAsFactors = FALSE, ...) {
char <- list_characters(tables = tables, sep = sep)
list_data_frames <- function(tables, sep = "\t", stringsAsFactors = FALSE, encoding = NULL, ...) {
char <- list_characters(tables = tables, sep = sep, encoding = encoding)
lapply(char, function(x) {
o <- try(read.delim(text = x, stringsAsFactors = stringsAsFactors, ...))
if (inherits(o, "try-error")) {
Expand Down
4 changes: 2 additions & 2 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
localize_file <- function(path, copy = FALSE) {
localize_file <- function(path, copy = FALSE, quiet = TRUE) {
if (grepl("^http.*://", path)) {
if (copy) {
tmp <- tempfile(fileext = ".pdf")
download.file(path, tmp, mode = "wb")
download.file(path, tmp, quiet = quiet, mode = "wb")
path <- tmp
} else {
path <- new(J("java.net.URL"), path)
Expand Down
4 changes: 3 additions & 1 deletion man/extract_tables.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/extract_text.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions tests/testthat.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Test runner entry point: attaches testthat and the package under test,
# then runs the package's testthat suite.
library("testthat")
library("tabulizer")

# NOTE(review): stop_logging() is not defined in this view; presumably it
# silences the Java-side logging before the tests run -- confirm.
stop_logging()
test_check("tabulizer")
19 changes: 19 additions & 0 deletions tests/testthat/test_non-latin.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
context("Non-latin character tests")

test_that("Read Spanish language PDF", {
  # Voting-record PDF from the tabula-java test resources, pinned to a commit.
  pdf_url <- "https://github.com/tabulapdf/tabula-java/raw/98957221950af4b90620b51a29e0bbe502eea9ad/src/test/resources/technology/tabula/argentina_diputados_voting_record.pdf"
  # Single extraction area on page 1, shared by every call below.
  table_area <- list(c(269.875, 12.75, 790.5, 561))

  # Default method: the first extracted table must come back as a matrix.
  default_tables <- extract_tables(pdf_url, pages = 1, area = table_area)
  expect_true(is.matrix(default_tables[[1]]))

  # data.frame extraction under each encoding. The calls must run without
  # error; the header-name expectations below are currently disabled.
  t1a <- extract_tables(pdf_url, pages = 1, area = table_area, method = "data.frame", encoding = "latin1")
  #expect_true(names(t1a[[1]])[2] == "Frente.CÃ.vico.por.Santiago", label = "latin1 encoding worked")
  t1b <- extract_tables(pdf_url, pages = 1, area = table_area, method = "data.frame", encoding = "UTF-8")
  #expect_true(names(t1b[[1]])[2] == "Frente.Cívico.por.Santiago", label = "UTF-8 encoding worked")
})

test_that("Read French language PDF w/correct encoding", {
  # Number of characters in the first line of an extracted text string.
  first_line_nchar <- function(txt) {
    nchar(strsplit(txt, "\n")[[1]][1])
  }

  f2 <- "http://journal-sfds.fr/index.php/J-SFdS/article/download/514/486"

  # `pages` is spelled out in full rather than relying on partial argument
  # matching, which would silently stop working if extract_text() ever gained
  # a `...` argument ahead of `pages`.
  t2a <- extract_text(f2, pages = 1, encoding = "latin1")[[1]]
  t2b <- extract_text(f2, pages = 1, encoding = "UTF-8")[[1]]

  # The declared encoding changes how many characters the first line is
  # counted as, so each encoding has its own expected length.
  expect_true(first_line_nchar(t2a) == 50, label = "latin1 encoding worked")
  expect_true(first_line_nchar(t2b) == 47, label = "UTF-8 encoding worked")
})

0 comments on commit 71c3b82

Please sign in to comment.