From 19cec9be15c16d827b038e693e79c74e2ae201b6 Mon Sep 17 00:00:00 2001 From: Thore Engel Date: Wed, 5 Jun 2019 09:19:26 +0200 Subject: [PATCH 1/5] Add locate_columns() function locate_columns allows users to manually locate separators between columns. It adopts the approach of the "reduced" widget from the related function locate_areas(). --- NAMESPACE | 1 + R/locate_columns.R | 106 +++++++++++++++++++++++++++++++++++++ man/locate_columns.Rd | 35 ++++++++++++ man/try_columns_reduced.Rd | 18 +++++++ 4 files changed, 160 insertions(+) create mode 100644 R/locate_columns.R create mode 100644 man/locate_columns.Rd create mode 100644 man/try_columns_reduced.Rd diff --git a/NAMESPACE b/NAMESPACE index deb9c46..4796d19 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(extract_text) export(get_n_pages) export(get_page_dims) export(locate_areas) +export(locate_columns) export(make_thumbnails) export(merge_pdfs) export(split_pdf) diff --git a/R/locate_columns.R b/R/locate_columns.R new file mode 100644 index 0000000..c094bcc --- /dev/null +++ b/R/locate_columns.R @@ -0,0 +1,106 @@ + +#' Locate separators between columns +#' +#' This function allows the user to manually locate the separators between columns of a table in a pdf. The output can be used as the \code{columns} argument in \code{extract_tables()} +#' +#' Manually selecting the separators can ensure that values stay in their respective columns. This is useful when some rows of a table have no or only little white space between columns. The code is an adaptation of the \code{locate_area()} function and its helpers. +#' @param file A character string specifying the path or URL to a PDF file. +#' @param pages An optional integer vector specifying pages to extract from. +#' @param resolution An integer specifying the resolution of the PNG images conversions. A low resolution is used by default to speed image loading. 
+#' @param copy Specifies whether the original local file(s) should be copied to tempdir() before processing. FALSE by default. The argument is ignored if file is URL. +#' @return a list. +#' @author Thore Engel +#' @export +#' +#' @examples +#' f <- system.file("examples", "data.pdf", package = "tabulizer") +#' separators<-locate_columns(f, pages= 1 ) +#' extract_tables(f,pages = 1, columns = separators[1]) +#' +locate_columns <- function(file, + pages = NULL, + resolution = 60L, + copy = FALSE) { + if (!interactive()) { + stop("locate_columns() is only available in an interactive session") + } else { + requireNamespace("graphics") + requireNamespace("grDevices") + } + + file <- localize_file(file, copy = copy) + # on.exit(unlink(file), add = TRUE) + dims <- get_page_dims(file, pages = pages) + paths <- make_thumbnails(file, + outdir = tempdir(), + pages = pages, + format = "png", + resolution = resolution) + on.exit(unlink(paths), add = TRUE) + + separators <- rep(list(NULL), length(paths)) + i <- 1 + warnThisTime <- TRUE + while (TRUE) { + if (!is.na(paths[i])) { + a <- try_columns_reduced(file = paths[i], + dims = dims[[i]],warn = warnThisTime) + if(warnThisTime) warnThisTime <- F + if (!is.null(a[["separators"]])) { + separators[[i]] <- a[["separators"]] + } + if (tolower(a[["key"]]) %in% c("del", "delete", "ctrl-h")) { + separators[i] <- list(NULL) + next + } else if (tolower(a[["key"]]) %in% c("home")) { + i <- 1 + next + } else if (tolower(a[["key"]]) %in% c("end")) { + i <- length(paths) + next + } else if (tolower(a[["key"]]) %in% c("pgup", "page_up", "up", "left")) { + i <- if (i == 1) 1 else i - 1 + next + } else if (tolower(a[["key"]]) %in% c("q")) { + break + } + } + i <- i + 1 + if (i > length(paths)) { + break + } + } + return(separators) +} + + +#' Helper function to locate_columns() +#' +#' @param file +#' @param dims +#' @param warn + +try_columns_reduced <- function(file, dims, warn = FALSE) { + if (warn) { + message("Click at the locations of 
separators between columns.") + } + if (grDevices::dev.capabilities()[["rasterImage"]] == "no") { + stop("Graphics device does not support rasterImage() plotting") + } + thispng <- readPNG(file, native = TRUE) + drawPage <- function() { + graphics::plot(c(0, dims[1]), c(0, dims[2]), type = "n", xlab = "", ylab = "", asp = 1) + graphics::rasterImage(thispng, 0, 0, dims[1], dims[2]) + } + + pre_par <- graphics::par(mar=c(0,0,0,0), xaxs = "i", yaxs = "i", bty = "n") + on.exit(graphics::par(pre_par), add = TRUE) + drawPage() + on.exit(grDevices::dev.off(), add = TRUE) + + tmp <- locator() + graphics::abline(v=tmp$x) + Sys.sleep(4) + separators= as.numeric(tmp$x) + return(list(key = "right", separators = separators)) +} diff --git a/man/locate_columns.Rd b/man/locate_columns.Rd new file mode 100644 index 0000000..6fc2aa4 --- /dev/null +++ b/man/locate_columns.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/locate_columns.R +\name{locate_columns} +\alias{locate_columns} +\title{Locate separators between columns} +\usage{ +locate_columns(file, pages = NULL, resolution = 60L, copy = FALSE) +} +\arguments{ +\item{file}{A character string specifying the path or URL to a PDF file.} + +\item{pages}{An optional integer vector specifying pages to extract from.} + +\item{resolution}{An integer specifying the resolution of the PNG images conversions. A low resolution is used by default to speed image loading.} + +\item{copy}{Specifies whether the original local file(s) should be copied to tempdir() before processing. FALSE by default. The argument is ignored if file is URL.} +} +\value{ +a list. +} +\description{ +This function allows the user to manually locate the separators between columns of a table in a pdf. The output can be used as the \code{columns} argument in \code{extract_tables()} +} +\details{ +Manually selecting the separators can ensure that values stay in their respective columns. 
This is useful when some rows of a table have no or only little white space between columns. The code is an adaptation of the \code{locate_area()} function and its helpers. +} +\examples{ +f <- system.file("examples", "data.pdf", package = "tabulizer") +separators<-locate_columns(f, pages= 1 ) +extract_tables(f,pages = 1, columns = separators[1]) + +} +\author{ +Thore Engel +} diff --git a/man/try_columns_reduced.Rd b/man/try_columns_reduced.Rd new file mode 100644 index 0000000..6abbf6c --- /dev/null +++ b/man/try_columns_reduced.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/locate_columns.R +\name{try_columns_reduced} +\alias{try_columns_reduced} +\title{Helper function to locate_columns()} +\usage{ +try_columns_reduced(file, dims, warn = FALSE) +} +\arguments{ +\item{file}{} + +\item{dims}{} + +\item{warn}{} +} +\description{ +Helper function to locate_columns() +} From 7a21e5a59e4b19be59cd63ce2455a0f2b99ac9b1 Mon Sep 17 00:00:00 2001 From: Thore Engel Date: Wed, 5 Jun 2019 09:19:26 +0200 Subject: [PATCH 2/5] Add locate_columns() function locate_columns allows users to manually locate separators between columns. It adopts the approach of the "reduced" widget from the related function locate_areas(). 
--- DESCRIPTION | 6 ++- NAMESPACE | 1 + NEWS.md | 3 ++ R/locate_columns.R | 106 +++++++++++++++++++++++++++++++++++++ man/locate_columns.Rd | 35 ++++++++++++ man/try_columns_reduced.Rd | 18 +++++++ 6 files changed, 167 insertions(+), 2 deletions(-) create mode 100644 R/locate_columns.R create mode 100644 man/locate_columns.Rd create mode 100644 man/try_columns_reduced.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 305cf1a..065425c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,9 @@ Authors@R: c(person("Thomas J.", "Leeper", comment = "rOpenSci reviewer"), person("Lincoln", "Mullen", role = "ctb", - comment = "rOpenSci reviewer")) + comment = "rOpenSci reviewer"), + person("Thore", "Engel", + role = "ctb")) Maintainer: Tom Paskhalis Description: Bindings for the 'Tabula' 'Java' library, which can extract tables from PDF documents. The 'tabulizerjars' @@ -40,4 +42,4 @@ Suggests: testthat SystemRequirements: Java (>= 7.0) VignetteBuilder: knitr -RoxygenNote: 6.0.1 +RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index deb9c46..4796d19 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(extract_text) export(get_n_pages) export(get_page_dims) export(locate_areas) +export(locate_columns) export(make_thumbnails) export(merge_pdfs) export(split_pdf) diff --git a/NEWS.md b/NEWS.md index caae7e8..bf731ff 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,8 +1,11 @@ + + # CHANGES TO tabulizer 0.2.2 * `extract_tables()` gets `outdir` argument for writing out CSV, TSV and JSON files. * Fixes in vignette. +* addition of `locate_columns()` function. # CHANGES TO tabulizer 0.2.1 diff --git a/R/locate_columns.R b/R/locate_columns.R new file mode 100644 index 0000000..c094bcc --- /dev/null +++ b/R/locate_columns.R @@ -0,0 +1,106 @@ + +#' Locate separators between columns +#' +#' This function allows the user to manually locate the separators between columns of a table in a pdf. 
The output can be used as the \code{columns} argument in \code{extract_tables()} +#' +#' Manually selecting the separators can ensure that values stay in their respective columns. This is useful when some rows of a table have no or only little white space between columns. The code is an adaptation of the \code{locate_area()} function and its helpers. +#' @param file A character string specifying the path or URL to a PDF file. +#' @param pages An optional integer vector specifying pages to extract from. +#' @param resolution An integer specifying the resolution of the PNG images conversions. A low resolution is used by default to speed image loading. +#' @param copy Specifies whether the original local file(s) should be copied to tempdir() before processing. FALSE by default. The argument is ignored if file is URL. +#' @return a list. +#' @author Thore Engel +#' @export +#' +#' @examples +#' f <- system.file("examples", "data.pdf", package = "tabulizer") +#' separators<-locate_columns(f, pages= 1 ) +#' extract_tables(f,pages = 1, columns = separators[1]) +#' +locate_columns <- function(file, + pages = NULL, + resolution = 60L, + copy = FALSE) { + if (!interactive()) { + stop("locate_columns() is only available in an interactive session") + } else { + requireNamespace("graphics") + requireNamespace("grDevices") + } + + file <- localize_file(file, copy = copy) + # on.exit(unlink(file), add = TRUE) + dims <- get_page_dims(file, pages = pages) + paths <- make_thumbnails(file, + outdir = tempdir(), + pages = pages, + format = "png", + resolution = resolution) + on.exit(unlink(paths), add = TRUE) + + separators <- rep(list(NULL), length(paths)) + i <- 1 + warnThisTime <- TRUE + while (TRUE) { + if (!is.na(paths[i])) { + a <- try_columns_reduced(file = paths[i], + dims = dims[[i]],warn = warnThisTime) + if(warnThisTime) warnThisTime <- F + if (!is.null(a[["separators"]])) { + separators[[i]] <- a[["separators"]] + } + if (tolower(a[["key"]]) %in% c("del", "delete", 
"ctrl-h")) { + separators[i] <- list(NULL) + next + } else if (tolower(a[["key"]]) %in% c("home")) { + i <- 1 + next + } else if (tolower(a[["key"]]) %in% c("end")) { + i <- length(paths) + next + } else if (tolower(a[["key"]]) %in% c("pgup", "page_up", "up", "left")) { + i <- if (i == 1) 1 else i - 1 + next + } else if (tolower(a[["key"]]) %in% c("q")) { + break + } + } + i <- i + 1 + if (i > length(paths)) { + break + } + } + return(separators) +} + + +#' Helper function to locate_columns() +#' +#' @param file +#' @param dims +#' @param warn + +try_columns_reduced <- function(file, dims, warn = FALSE) { + if (warn) { + message("Click at the locations of separators between columns.") + } + if (grDevices::dev.capabilities()[["rasterImage"]] == "no") { + stop("Graphics device does not support rasterImage() plotting") + } + thispng <- readPNG(file, native = TRUE) + drawPage <- function() { + graphics::plot(c(0, dims[1]), c(0, dims[2]), type = "n", xlab = "", ylab = "", asp = 1) + graphics::rasterImage(thispng, 0, 0, dims[1], dims[2]) + } + + pre_par <- graphics::par(mar=c(0,0,0,0), xaxs = "i", yaxs = "i", bty = "n") + on.exit(graphics::par(pre_par), add = TRUE) + drawPage() + on.exit(grDevices::dev.off(), add = TRUE) + + tmp <- locator() + graphics::abline(v=tmp$x) + Sys.sleep(4) + separators= as.numeric(tmp$x) + return(list(key = "right", separators = separators)) +} diff --git a/man/locate_columns.Rd b/man/locate_columns.Rd new file mode 100644 index 0000000..6fc2aa4 --- /dev/null +++ b/man/locate_columns.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/locate_columns.R +\name{locate_columns} +\alias{locate_columns} +\title{Locate separators between columns} +\usage{ +locate_columns(file, pages = NULL, resolution = 60L, copy = FALSE) +} +\arguments{ +\item{file}{A character string specifying the path or URL to a PDF file.} + +\item{pages}{An optional integer vector specifying pages to extract from.} + 
+\item{resolution}{An integer specifying the resolution of the PNG images conversions. A low resolution is used by default to speed image loading.} + +\item{copy}{Specifies whether the original local file(s) should be copied to tempdir() before processing. FALSE by default. The argument is ignored if file is URL.} +} +\value{ +a list. +} +\description{ +This function allows the user to manually locate the separators between columns of a table in a pdf. The output can be used as the \code{columns} argument in \code{extract_tables()} +} +\details{ +Manually selecting the separators can ensure that values stay in their respective columns. This is useful when some rows of a table have no or only little white space between columns. The code is an adaptation of the \code{locate_area()} function and its helpers. +} +\examples{ +f <- system.file("examples", "data.pdf", package = "tabulizer") +separators<-locate_columns(f, pages= 1 ) +extract_tables(f,pages = 1, columns = separators[1]) + +} +\author{ +Thore Engel +} diff --git a/man/try_columns_reduced.Rd b/man/try_columns_reduced.Rd new file mode 100644 index 0000000..6abbf6c --- /dev/null +++ b/man/try_columns_reduced.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/locate_columns.R +\name{try_columns_reduced} +\alias{try_columns_reduced} +\title{Helper function to locate_columns()} +\usage{ +try_columns_reduced(file, dims, warn = FALSE) +} +\arguments{ +\item{file}{} + +\item{dims}{} + +\item{warn}{} +} +\description{ +Helper function to locate_columns() +} From f9d471bd8fcd185a094e84c172839849ae4b1ce5 Mon Sep 17 00:00:00 2001 From: Thore Engel Date: Wed, 5 Jun 2019 12:45:47 +0200 Subject: [PATCH 3/5] Add `\donttest` to example --- R/locate_columns.R | 2 ++ man/locate_columns.Rd | 2 ++ 2 files changed, 4 insertions(+) diff --git a/R/locate_columns.R b/R/locate_columns.R index c094bcc..06d0743 100644 --- a/R/locate_columns.R +++ b/R/locate_columns.R @@ -13,9 +13,11 @@ 
#' @export #' #' @examples +#' \donttest{ #' f <- system.file("examples", "data.pdf", package = "tabulizer") #' separators<-locate_columns(f, pages= 1 ) #' extract_tables(f,pages = 1, columns = separators[1]) +#' } #' locate_columns <- function(file, pages = NULL, diff --git a/man/locate_columns.Rd b/man/locate_columns.Rd index 6fc2aa4..0e1f06a 100644 --- a/man/locate_columns.Rd +++ b/man/locate_columns.Rd @@ -25,9 +25,11 @@ This function allows the user to manually locate the separators between columns Manually selecting the separators can ensure that values stay in their respective columns. This is useful when some rows of a table have no or only little white space between columns. The code is an adaptation of the \code{locate_area()} function and its helpers. } \examples{ +\donttest{ f <- system.file("examples", "data.pdf", package = "tabulizer") separators<-locate_columns(f, pages= 1 ) extract_tables(f,pages = 1, columns = separators[1]) +} } \author{ From c5741bb1470e65b574fbe6fddb0530c3873d86c3 Mon Sep 17 00:00:00 2001 From: Thore Engel Date: Wed, 5 Jun 2019 13:02:09 +0200 Subject: [PATCH 4/5] Add description to try_columns_reduced() --- R/locate_columns.R | 6 +++--- man/try_columns_reduced.Rd | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/locate_columns.R b/R/locate_columns.R index 06d0743..cddaeda 100644 --- a/R/locate_columns.R +++ b/R/locate_columns.R @@ -78,9 +78,9 @@ locate_columns <- function(file, #' Helper function to locate_columns() #' -#' @param file -#' @param dims -#' @param warn +#' @param file A character string specifying the path to the PNG image of the page to display. +#' @param dims A numeric vector whose first two elements give the width and height of the page; they set the extent of the plotting region the image is drawn into. +#' @param warn Logical. If \code{TRUE}, print a message instructing the user to click at the column separators. 
try_columns_reduced <- function(file, dims, warn = FALSE) { if (warn) { diff --git a/man/try_columns_reduced.Rd b/man/try_columns_reduced.Rd index 6abbf6c..4665e66 100644 --- a/man/try_columns_reduced.Rd +++ b/man/try_columns_reduced.Rd @@ -7,11 +7,11 @@ try_columns_reduced(file, dims, warn = FALSE) } \arguments{ -\item{file}{} +\item{file}{A character string specifying the path to the PNG image of the page to display.} -\item{dims}{} +\item{dims}{A numeric vector whose first two elements give the width and height of the page; they set the extent of the plotting region the image is drawn into.} -\item{warn}{} +\item{warn}{Logical. If \code{TRUE}, print a message instructing the user to click at the column separators.} } \description{ Helper function to locate_columns() From 7723d2bbf60d6b91929d4d93250fa95075c4d75d Mon Sep 17 00:00:00 2001 From: Thore Engel Date: Fri, 10 Jan 2020 14:03:23 +0100 Subject: [PATCH 5/5] Formatting --- man/extract_areas.Rd | 4 ++-- man/make_thumbnails.Rd | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/man/extract_areas.Rd b/man/extract_areas.Rd index 41a7fea..950dd90 100644 --- a/man/extract_areas.Rd +++ b/man/extract_areas.Rd @@ -5,8 +5,8 @@ \alias{extract_areas} \title{extract_areas} \usage{ -locate_areas(file, pages = NULL, resolution = 60L, widget = c("shiny", - "native", "reduced"), copy = FALSE) +locate_areas(file, pages = NULL, resolution = 60L, + widget = c("shiny", "native", "reduced"), copy = FALSE) extract_areas(file, pages = NULL, guess = FALSE, copy = FALSE, ...) } diff --git a/man/make_thumbnails.Rd b/man/make_thumbnails.Rd index 2f00dd1..beee5a4 100644 --- a/man/make_thumbnails.Rd +++ b/man/make_thumbnails.Rd @@ -5,7 +5,8 @@ \title{make_thumbnails} \usage{ make_thumbnails(file, outdir = NULL, pages = NULL, format = c("png", - "jpeg", "bmp", "gif"), resolution = 72, password = NULL, copy = FALSE) + "jpeg", "bmp", "gif"), resolution = 72, password = NULL, + copy = FALSE) } \arguments{ \item{file}{A character string specifying the path or URL to a PDF file.}