From 096eb48bbdb00711a68217239cbda988cb8a27bc Mon Sep 17 00:00:00 2001 From: mara004 Date: Tue, 15 Mar 2022 17:28:08 +0100 Subject: [PATCH] refactor: Updated version of pypdfium2 and its calls (#845) * Switch to new pypdfium2 API * conf.py: fix intersphinx link * apply review suggestions * Fix the intersphinx function reference I need to do something to make these paths shorter. --- docs/source/conf.py | 6 ++++++ doctr/io/pdf.py | 14 +++++++++----- requirements-pt.txt | 2 +- requirements.txt | 2 +- setup.py | 2 +- 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index c8110bc5da..1ce4609a6b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -39,6 +39,7 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', + 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', @@ -49,6 +50,11 @@ 'sphinx_markdown_tables', ] +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', None), + 'pypdfium2': ('https://pypdfium2.readthedocs.io/en/stable/', None), +} + napoleon_use_ivar = True # Add any paths that contain templates here, relative to this directory. diff --git a/doctr/io/pdf.py b/doctr/io/pdf.py index f48397d737..fcbbfcd3ee 100644 --- a/doctr/io/pdf.py +++ b/doctr/io/pdf.py @@ -3,6 +3,7 @@ # This program is licensed under the Apache License version 2. # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+import os.path from pathlib import Path from typing import Any, List @@ -24,15 +25,18 @@ def read_pdf(file: AbstractFile, scale: float = 2, **kwargs: Any) -> List[np.nda Args: file: the path to the PDF file scale: rendering scale (1 corresponds to 72dpi) + kwargs: additional parameters to :func:`pypdfium2._helpers.pdf_renderer.render_pdf_topil` Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 + the list of pages decoded as numpy ndarray of shape H x W x C """ - if not isinstance(file, (str, Path, bytes)): + if isinstance(file, Path): + file = str(file) + if not isinstance(file, (str, bytes)): raise TypeError("unsupported object type for argument 'file'") - if isinstance(file, (str, Path)) and not Path(file).is_file(): + if isinstance(file, str) and not os.path.isfile(file): raise FileNotFoundError(f"unable to access {file}") - # Read pages with fitz and convert them to numpy ndarrays - return [np.asarray(img) for img, _ in pdfium.render_pdf(file, scale=scale)] + # Rasterise pages to PIL images with pypdfium2 and convert to numpy ndarrays + return [np.asarray(img) for img, _ in pdfium.render_pdf_topil(file, scale=scale, **kwargs)] diff --git a/requirements-pt.txt b/requirements-pt.txt index 1728818570..9aa8aab34b 100644 --- a/requirements-pt.txt +++ b/requirements-pt.txt @@ -2,7 +2,7 @@ numpy>=1.16.0 scipy>=1.4.0 h5py>=3.1.0 opencv-python>=3.4.5.20 -pypdfium2>=0.14.0 +pypdfium2>=1.0.0 pyclipper>=1.2.0 shapely>=1.6.0 matplotlib>=3.1.0,<3.4.3 diff --git a/requirements.txt b/requirements.txt index 67c06b4e74..4a70a89913 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ numpy>=1.16.0 scipy>=1.4.0 h5py>=3.1.0 opencv-python>=3.4.5.20 -pypdfium2>=0.14.0 +pypdfium2>=1.0.0 pyclipper>=1.2.0 shapely>=1.6.0 matplotlib>=3.1.0,<3.4.3 diff --git a/setup.py b/setup.py index d1290c3580..565f883daa 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ "h5py>=3.1.0", "opencv-python>=3.4.5.20", "tensorflow>=2.4.0", - "pypdfium2>=0.14.0", + 
"pypdfium2>=1.0.0", "pyclipper>=1.2.0", "shapely>=1.6.0", "matplotlib>=3.1.0,<3.4.3",