Skip to content

Commit

Permalink
small PR to use cpp11
Browse files Browse the repository at this point in the history
  • Loading branch information
pachadotdev committed Aug 3, 2024
1 parent 9131570 commit 156873b
Show file tree
Hide file tree
Showing 14 changed files with 384 additions and 390 deletions.
31 changes: 19 additions & 12 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,28 +1,35 @@
Package: tesseract
Type: Package
Title: Open Source OCR Engine
Version: 5.2.1
Authors@R: person("Jeroen", "Ooms", role = c("aut", "cre"), email = "[email protected]",
comment = c(ORCID = "0000-0002-4035-0289"))
Version: 5.3.0
Authors@R: c(person("Jeroen", "Ooms",
role = c("aut", "cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0002-4035-0289")),
person("Mauricio", "Vargas Sepulveda",
role = "aut",
email = "[email protected]",
comment = c(ORCID = "0000-0003-1017-7574")))
Description: Bindings to 'Tesseract':
a powerful optical character recognition (OCR) engine that supports over 100 languages.
The engine is highly configurable in order to tune the detection algorithms and
obtain the best possible results.
a powerful optical character recognition (OCR) engine that supports over
100 languages. The engine is highly configurable in order to tune the
detection algorithms and obtain the best possible results.
License: Apache License 2.0
URL: https://docs.ropensci.org/tesseract/ (website)
https://github.com/ropensci/tesseract (devel)
BugReports: https://github.com/ropensci/tesseract/issues
SystemRequirements: Tesseract >= 3.03 (libtesseract-dev / tesseract-devel) and
Leptonica (libleptonica-dev / leptonica-devel). On Debian you need to install
the English training data separately (tesseract-ocr-eng)
SystemRequirements: Tesseract >= 5.0.0 (libtesseract-dev / tesseract-devel) and
Leptonica (libleptonica-dev / leptonica-devel). On Debian you need to
install the training data for English and any other language separately
(e.g. tesseract-ocr-eng or tesseract-ocr-spa).
Imports:
Rcpp (>= 0.12.12),
pdftools (>= 1.5),
curl,
rappdirs,
digest
LinkingTo: Rcpp
RoxygenNote: 7.2.3
LinkingTo:
cpp11
RoxygenNote: 7.3.1
Roxygen: list(markdown = TRUE)
Suggests:
magick (>= 1.7),
Expand Down
3 changes: 1 addition & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,4 @@ export(tesseract)
export(tesseract_download)
export(tesseract_info)
export(tesseract_params)
importFrom(Rcpp,sourceCpp)
useDynLib(tesseract)
useDynLib(tesseract, .registration = TRUE)
47 changes: 0 additions & 47 deletions R/RcppExports.R

This file was deleted.

45 changes: 45 additions & 0 deletions R/cpp11.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Generated by cpp11: do not edit by hand

# Zero-argument R entry point: hands control to the native routine
# `_tesseract_tesseract_config` via `.Call` and returns its result as-is.
tesseract_config <- function() .Call(`_tesseract_tesseract_config`)

# Forwards all five arguments, unchanged and in order, to the native routine
# `_tesseract_tesseract_engine_internal` and returns whatever it produces.
tesseract_engine_internal <- function(datapath, language, confpaths,
                                      opt_names, opt_values) {
  .Call(
    `_tesseract_tesseract_engine_internal`,
    datapath, language, confpaths, opt_names, opt_values
  )
}

# Passes `ptr`, `name` and `value` straight through to the native routine
# `_tesseract_tesseract_engine_set_variable`; the native result is returned.
tesseract_engine_set_variable <- function(ptr, name, value) {
  .Call(
    `_tesseract_tesseract_engine_set_variable`,
    ptr, name, value
  )
}

# R-side shim for the native routine `_tesseract_validate_params`:
# `params` is forwarded untouched and the native return value is handed back.
validate_params <- function(params) .Call(`_tesseract_validate_params`, params)

# Calls the native routine `_tesseract_engine_info_internal` with `ptr` and
# returns its result without any post-processing on the R side.
engine_info_internal <- function(ptr) {
  .Call(
    `_tesseract_engine_info_internal`,
    ptr
  )
}

# Delegates to the native routine `_tesseract_print_params`, giving it
# `filename` as its only argument, and returns the native result.
print_params <- function(filename) .Call(`_tesseract_print_params`, filename)

# Hands `api` and `params` (in that order) to the native routine
# `_tesseract_get_param_values` and returns what it yields.
get_param_values <- function(api, params) {
  .Call(
    `_tesseract_get_param_values`,
    api, params
  )
}

# Bridge to the native routine `_tesseract_ocr_raw`; `input`, `ptr` and `HOCR`
# are forwarded positionally and the native result is returned unchanged.
ocr_raw <- function(input, ptr, HOCR) .Call(`_tesseract_ocr_raw`, input, ptr, HOCR)

# Bridge to the native routine `_tesseract_ocr_file`; `file`, `ptr` and `HOCR`
# are forwarded positionally and the native result is returned unchanged.
ocr_file <- function(file, ptr, HOCR) {
  .Call(
    `_tesseract_ocr_file`,
    file, ptr, HOCR
  )
}

# Invokes the native routine `_tesseract_ocr_raw_data` with `input` and `ptr`
# and returns its result directly.
ocr_raw_data <- function(input, ptr) .Call(`_tesseract_ocr_raw_data`, input, ptr)

# Invokes the native routine `_tesseract_ocr_file_data` with `file` and `ptr`
# and returns its result directly.
ocr_file_data <- function(file, ptr) {
  .Call(
    `_tesseract_ocr_file_data`,
    file, ptr
  )
}
6 changes: 2 additions & 4 deletions R/ocr.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,13 @@
#' each word in the text.
#'
#' @export
#' @useDynLib tesseract
#' @family tesseract
#' @param image file path, url, or raw vector to image (png, tiff, jpeg, etc)
#' @param engine a tesseract engine created with [tesseract()]. Alternatively a
#' language string which will be passed to [tesseract()].
#' @param HOCR if `TRUE` return results as HOCR xml instead of plain text
#' @rdname ocr
#' @references [Tesseract: Improving Quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality)
#' @importFrom Rcpp sourceCpp
#' @examples # Simple example
#' text <- ocr("https://jeroen.github.io/images/testocr.png")
#' cat(text)
Expand Down Expand Up @@ -48,7 +46,7 @@
ocr <- function(image, engine = tesseract("eng"), HOCR = FALSE) {
if(is.character(engine))
engine <- tesseract(engine)
stopifnot(inherits(engine, "tesseract"))
stopifnot(inherits(engine, "externalptr"))
if(inherits(image, "magick-image")){
vapply(image, function(x){
tmp <- tempfile(fileext = ".png")
Expand All @@ -71,7 +69,7 @@ ocr <- function(image, engine = tesseract("eng"), HOCR = FALSE) {
ocr_data <- function(image, engine = tesseract("eng")) {
if(is.character(engine))
engine <- tesseract(engine)
stopifnot(inherits(engine, "tesseract"))
stopifnot(inherits(engine, "externalptr"))
df_list <- if(inherits(image, "magick-image")){
lapply(image, function(x){
tmp <- tempfile(fileext = ".png")
Expand Down
11 changes: 11 additions & 0 deletions R/tesseract-package.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Package-level documentation stub. The trailing "_PACKAGE" string is the
# roxygen2 sentinel that attaches the block below to the package itself rather
# than to a function. The @useDynLib tag is what produces the
# `useDynLib(tesseract, .registration = TRUE)` directive in NAMESPACE.
#' @title Open Source OCR Engine
#'
#' @description
#' Bindings to 'Tesseract':
#' a powerful optical character recognition (OCR) engine that supports over 100
#' languages. The engine is highly configurable in order to tune the detection
#' algorithms and obtain the best possible results.
#'
#' @name tesseract-package
#' @useDynLib tesseract, .registration = TRUE
"_PACKAGE"
4 changes: 4 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
chinese
config
EPEL
github
Expand All @@ -15,12 +16,15 @@ Magick
Nederlands
ocr
opensource
ORCID
pdftools
png
rmarkdown
spanish
Sepulveda
tessdata
toc
utrecht
VignetteEncoding
VignetteEngine
VignetteIndexEntry
4 changes: 2 additions & 2 deletions man/ocr.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions man/tesseract-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.o
*.so
*.dll
Loading

0 comments on commit 156873b

Please sign in to comment.