From 00e789bf42f73319a86610a77b88499499f2905c Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Sun, 26 Jul 2020 15:27:51 -0400 Subject: [PATCH] Shift to making pdfplumber.open the sole loader - .from_path is now removed - .load is marked as deprecated, to be removed in 0.6.0 --- README.md | 9 +++++---- pdfplumber/__init__.py | 18 +++++++++++++----- pdfplumber/cli.py | 2 +- pdfplumber/pdf.py | 8 ++++++-- tests/test_basics.py | 23 +++++++++++++---------- tests/test_ca_warn_report.py | 9 +++++++-- tests/test_display.py | 7 ++++++- tests/test_issues.py | 18 ++++++++++++------ tests/test_la_precinct_bulletin.py | 10 +++++++--- tests/test_list_metadata.py | 3 ++- tests/test_nics_report.py | 10 +++++++--- 11 files changed, 79 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 91b22c35..94df63f7 100644 --- a/README.md +++ b/README.md @@ -59,12 +59,13 @@ with pdfplumber.open("path/to/file.pdf") as pdf: ### Loading a PDF -`pdfplumber` provides two main ways to load a PDF: +To start working with a PDF, call `pdfplumber.open(x)`, where `x` can be a: -- `pdfplumber.open("path/to/file.pdf")` -- `pdfplumber.load(file_like_object)` +- path to your PDF file +- file object, loaded as bytes +- file-like object, loaded as bytes -Both methods return an instance of the `pdfplumber.PDF` class. +The `open` method returns an instance of the `pdfplumber.PDF` class. To load a password-protected PDF, pass the `password` keyword argument, e.g., `pdfplumber.open("file.pdf", password = "test")`. diff --git a/pdfplumber/__init__.py b/pdfplumber/__init__.py index a6e66b0f..6ef7deaa 100644 --- a/pdfplumber/__init__.py +++ b/pdfplumber/__init__.py @@ -1,19 +1,27 @@ +__all__ = [ + "__version__", + "utils", + "pdfminer", + "open", + "set_debug", +] + +from ._version import __version__ from .pdf import PDF from . import utils import pdfminer import pdfminer.pdftypes -from ._version import __version__ +import sys pdfminer.pdftypes.STRICT = False pdfminer.pdfinterp.STRICT = False +open = PDF.open + def load(file_or_buffer, **kwargs): + sys.stderr.write("Warning: pdfplumber.load is deprecated. Please use pdfplumber.open (with same arguments) instead.\n") return PDF(file_or_buffer, **kwargs) -open = PDF.open -# Old idiom -from_path = PDF.open - def set_debug(debug=0): pdfminer.debug = debug diff --git a/pdfplumber/cli.py b/pdfplumber/cli.py index 7c6b5c2d..f9a69bd7 100755 --- a/pdfplumber/cli.py +++ b/pdfplumber/cli.py @@ -93,7 +93,7 @@ def get_page_data(page): def main(): args = parse_args() - pdf = pdfplumber.load(args.infile, pages=args.pages) + pdf = pdfplumber.open(args.infile, pages=args.pages) if args.format == "csv": to_csv(pdf, args.types, args.encoding) else: diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index 72b4a8cb..2a2241c7 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -2,6 +2,7 @@ from .page import Page from .utils import decode_text +import pathlib from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfpage import PDFPage @@ -44,8 +45,11 @@ def __init__(self, self.interpreter = PDFPageInterpreter(rsrcmgr, self.device) @classmethod - def open(cls, path, **kwargs): - return cls(open(path, "rb"), **kwargs) + def open(cls, path_or_fp, **kwargs): + if isinstance(path_or_fp, (str, pathlib.Path)): + return cls(open(path_or_fp, "rb"), **kwargs) + else: + return cls(path_or_fp, **kwargs) def process_page(self, page): self.interpreter.process_page(page) diff --git a/tests/test_basics.py b/tests/test_basics.py index 364ce205..72230d91 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -12,9 +12,14 @@ class Test(unittest.TestCase): - def setUp(self): + @classmethod + def setup_class(self): path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf") - self.pdf = pdfplumber.from_path(path) + self.pdf = pdfplumber.open(path) + + @classmethod + def teardown_class(self): + self.pdf.close() def test_metadata(self): metadata = self.pdf.metadata @@ -39,17 +44,15 @@ def test(obj): assert(len(step_2.rects) == 0) def test_rotation(self): - rotated = pdfplumber.from_path( - os.path.join(HERE, "pdfs/nics-background-checks-2015-11-rotated.pdf") - ) assert(self.pdf.pages[0].width == 1008) assert(self.pdf.pages[0].height == 612) + path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11-rotated.pdf") + with pdfplumber.open(path) as rotated: + assert(rotated.pages[0].width == 612) + assert(rotated.pages[0].height == 1008) - assert(rotated.pages[0].width == 612) - assert(rotated.pages[0].height == 1008) - - assert(rotated.pages[0].cropbox == self.pdf.pages[0].cropbox) - assert(rotated.pages[0].bbox != self.pdf.pages[0].bbox) + assert(rotated.pages[0].cropbox == self.pdf.pages[0].cropbox) + assert(rotated.pages[0].bbox != self.pdf.pages[0].bbox) def test_password(self): path = os.path.join(HERE, "pdfs/password-example.pdf") diff --git a/tests/test_ca_warn_report.py b/tests/test_ca_warn_report.py index 3937e6cc..6884c219 100644 --- a/tests/test_ca_warn_report.py +++ b/tests/test_ca_warn_report.py @@ -16,11 +16,16 @@ def fix_row_spaces(row): class Test(unittest.TestCase): - def setUp(self): + @classmethod + def setup_class(self): path = os.path.join(HERE, "pdfs/WARN-Report-for-7-1-2015-to-03-25-2016.pdf") - self.pdf = pdfplumber.from_path(path) + self.pdf = pdfplumber.open(path) self.PDF_WIDTH = self.pdf.pages[0].width + @classmethod + def teardown_class(self): + self.pdf.close() + def test_pandas(self): rect_x0_clusters = utils.cluster_list([ r["x0"] diff --git a/tests/test_display.py b/tests/test_display.py index 4b5b6667..2a015b41 100644 --- a/tests/test_display.py +++ b/tests/test_display.py @@ -11,11 +11,16 @@ class Test(unittest.TestCase): - def setUp(self): + @classmethod + def setup_class(self): path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf") self.pdf = pdfplumber.open(path) self.im = self.pdf.pages[0].to_image() + @classmethod + def teardown_class(self): + self.pdf.close() + def test_basic_conversion(self): self.im.reset() self.im.draw_rect(self.im.page.rects[0]) diff --git a/tests/test_issues.py b/tests/test_issues.py index a681138c..a334bc5a 100644 --- a/tests/test_issues.py +++ b/tests/test_issues.py @@ -16,7 +16,7 @@ def test_issue_13(self): """ Test slightly simplified from gist here: https://github.com/jsvine/pdfplumber/issues/13 """ - pdf = pdfplumber.from_path( + pdf = pdfplumber.open( os.path.join(HERE, "pdfs/issue-13-151201DSP-Fond-581-90D.pdf") ) @@ -72,36 +72,42 @@ def determine_if_checked(checkbox, curve_list): for rect in rects ]) assert(n_checked == 5) + pdf.close() def test_issue_14(self): - pdf = pdfplumber.from_path( + pdf = pdfplumber.open( os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf") ) assert len(pdf.objects) + pdf.close() def test_issue_21(self): - pdf = pdfplumber.from_path( + pdf = pdfplumber.open( os.path.join(HERE, "pdfs/150109DSP-Milw-505-90D.pdf") ) assert len(pdf.objects) + pdf.close() def test_issue_33(self): - pdf = pdfplumber.from_path( + pdf = pdfplumber.open( os.path.join(HERE, "pdfs/issue-33-lorem-ipsum.pdf") ) assert len(pdf.metadata.keys()) + pdf.close() def test_issue_53(self): - pdf = pdfplumber.from_path( + pdf = pdfplumber.open( os.path.join(HERE, "pdfs/issue-53-example.pdf") ) assert len(pdf.objects) + pdf.close() def test_issue_67(self): - pdf = pdfplumber.from_path( + pdf = pdfplumber.open( os.path.join(HERE, "pdfs/issue-67-example.pdf") ) assert len(pdf.metadata.keys()) + pdf.close() def test_pr_77(self): # via https://github.com/jsvine/pdfplumber/pull/77 diff --git a/tests/test_la_precinct_bulletin.py b/tests/test_la_precinct_bulletin.py index b0f4b1ba..c4ddfa02 100644 --- a/tests/test_la_precinct_bulletin.py +++ b/tests/test_la_precinct_bulletin.py @@ -100,12 +100,16 @@ def to_dict(self): } class Test(unittest.TestCase): - - def setUp(self): + @classmethod + def setup_class(self): path = os.path.join(HERE, "pdfs/la-precinct-bulletin-2014-p1.pdf") - self.pdf = pdfplumber.from_path(path) + self.pdf = pdfplumber.open(path) self.PDF_WIDTH = self.pdf.pages[0].width + @classmethod + def teardown_class(self): + self.pdf.close() + def test_pandas(self): p1 = PrecinctPage(self.pdf.pages[0]).to_dict() assert(p1["registered_voters"] == 1100) diff --git a/tests/test_list_metadata.py b/tests/test_list_metadata.py index f2cd9c71..54ceb3b4 100644 --- a/tests/test_list_metadata.py +++ b/tests/test_list_metadata.py @@ -12,4 +12,5 @@ class Test(unittest.TestCase): def test_load(self): path = os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf") - pdf = pdfplumber.from_path(path) + with pdfplumber.open(path) as pdf: + assert len(pdf.metadata) diff --git a/tests/test_nics_report.py b/tests/test_nics_report.py index 2b51a43f..3d065e75 100644 --- a/tests/test_nics_report.py +++ b/tests/test_nics_report.py @@ -40,12 +40,16 @@ ] class Test(unittest.TestCase): - - def setUp(self): + @classmethod + def setup_class(self): path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf") - self.pdf = pdfplumber.from_path(path) + self.pdf = pdfplumber.open(path) self.PDF_WIDTH = self.pdf.pages[0].width + @classmethod + def teardown_class(self): + self.pdf.close() + def test_plain(self): page = self.pdf.pages[0] cropped = page.crop((0, 80, self.PDF_WIDTH, 485))