-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
pass optional encodings to handle non-latin text (closes #10)
- Loading branch information
Showing
10 changed files
with
51 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
Package: tabulizer | ||
Type: Package | ||
Title: Bindings for Tabula PDF Table Extractor Library | ||
Version: 0.1.14 | ||
Version: 0.1.15 | ||
Date: 2016-05-31 | ||
Authors@R: c(person("Thomas J.", "Leeper", role = c("aut", "cre"), | ||
email = "[email protected]"), | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
#' @param file A character string specifying the path or URL to a PDF file. | ||
#' @param pages An optional integer vector specifying pages to extract from. | ||
#' @param password Optionally, a character string containing a user password to access a secured PDF. | ||
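#' @param encoding Optionally, a character string declaring the encoding of the extracted text (one of \code{"latin1"}, \code{"UTF-8"}, \code{"bytes"}, or \code{"unknown"}). It is passed to the assignment method of \code{\link[base]{Encoding}}, which only marks the strings with that encoding and does not convert them. If \code{NULL} (the default), the text is returned unchanged. | ||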
#' @details This function converts the contents of a PDF file into a single unstructured character string. | ||
#' @return If \code{pages = NULL} (the default), a length 1 character vector, otherwise a vector of length \code{length(pages)}. | ||
#' @author Thomas J. Leeper <[email protected]> | ||
|
@@ -20,7 +21,7 @@ | |
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{split_pdf}} | ||
#' @importFrom rJava J new | ||
#' @export | ||
extract_text <- function(file, pages = NULL, password = NULL) { | ||
extract_text <- function(file, pages = NULL, password = NULL, encoding = NULL) { | ||
pdfDocument <- load_doc(file, password = password) | ||
on.exit(pdfDocument$close()) | ||
|
||
|
@@ -36,5 +37,8 @@ extract_text <- function(file, pages = NULL, password = NULL) { | |
} else { | ||
out <- stripper$getText(pdfDocument) | ||
} | ||
if (!is.null(encoding)) { | ||
Encoding(out) <- encoding | ||
} | ||
out | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
library("testthat") | ||
library("tabulizer") | ||
|
||
stop_logging() | ||
test_check("tabulizer") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
context("Non-latin character tests") | ||
|
||
test_that("Read Spanish language PDF", { | ||
f1 <- "https://github.com/tabulapdf/tabula-java/raw/98957221950af4b90620b51a29e0bbe502eea9ad/src/test/resources/technology/tabula/argentina_diputados_voting_record.pdf" | ||
expect_true(is.matrix(extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)))[[1]])) | ||
t1a <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), method = "data.frame", encoding = "latin1") | ||
#expect_true(names(t1a[[1]])[2] == "Frente.CÃ.vico.por.Santiago", label = "latin1 encoding worked") | ||
t1b <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), method = "data.frame", encoding = "UTF-8") | ||
#expect_true(names(t1b[[1]])[2] == "Frente.Cívico.por.Santiago", label = "UTF-8 encoding worked") | ||
|
||
}) | ||
|
||
test_that("Read French language PDF w/correct encoding", { | ||
f2 <- "http://journal-sfds.fr/index.php/J-SFdS/article/download/514/486" | ||
t2a <- extract_text(f2, page = 1, encoding = "latin1")[[1]] | ||
t2b <- extract_text(f2, page = 1, encoding = "UTF-8")[[1]] | ||
expect_true(nchar(strsplit(t2a, "\n")[[1]][1]) == 50, label = "latin1 encoding worked") | ||
expect_true(nchar(strsplit(t2b, "\n")[[1]][1]) == 47, label = "UTF-8 encoding worked") | ||
}) |