From 2d9415cdd09fadb3c024c314db0684a14eeb3f98 Mon Sep 17 00:00:00 2001 From: Samkit Jain <15127115+samkit-jain@users.noreply.github.com> Date: Sun, 29 Nov 2020 15:41:01 +0530 Subject: [PATCH] Recursively parse metadata values to handle nested `PDFObjRef` objects Fixes #316 --- pdfplumber/pdf.py | 13 ++----------- pdfplumber/utils.py | 18 ++++++++++++++++++ tests/test_issues.py | 2 +- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index 06ffb9d2..a0964617 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -1,6 +1,6 @@ from .container import Container from .page import Page -from .utils import decode_text +from .utils import resolve_and_decode import logging import pathlib @@ -31,16 +31,7 @@ def __init__(self, stream, pages=None, laparams=None, precision=0.001, password= self.metadata.update(info) for k, v in self.metadata.items(): try: - if hasattr(v, "resolve"): - v = v.resolve() - if type(v) == list: - self.metadata[k] = list(map(decode_text, v)) - elif isinstance(v, PSLiteral): - self.metadata[k] = decode_text(v.name) - elif isinstance(v, (str, bytes)): - self.metadata[k] = decode_text(v) - else: - self.metadata[k] = v + self.metadata[k] = resolve_and_decode(v) except Exception as e: # This metadata value could not be parsed. Instead of failing the PDF read, # treat it as a warning. 
diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py
index f71d3c6c..05a4fa56 100644
--- a/pdfplumber/utils.py
+++ b/pdfplumber/utils.py
@@ -78,6 +78,24 @@ def decode_text(s):
         return "".join(PDFDocEncoding[o] for o in ords)
 
 
+def resolve_and_decode(obj):
+    """Recursively resolve references in a metadata value and decode its text."""
+    if hasattr(obj, "resolve"):
+        obj = obj.resolve()
+    if isinstance(obj, list):
+        return [resolve_and_decode(item) for item in obj]
+    elif isinstance(obj, PSLiteral):
+        return decode_text(obj.name)
+    elif isinstance(obj, (str, bytes)):
+        return decode_text(obj)
+    elif isinstance(obj, dict):
+        # Build a new dict instead of assigning into `obj`: the input may be
+        # shared with its producer, and decoding must not alter it in place.
+        return {k: resolve_and_decode(v) for k, v in obj.items()}
+    # Any other type is returned unchanged.
+    return obj
+
+
 def decode_psl_list(_list):
     return [
         decode_text(value.name) if isinstance(value, PSLiteral) else value
diff --git a/tests/test_issues.py b/tests/test_issues.py
index 886cbfc8..5288d2b7 100644
--- a/tests/test_issues.py
+++ b/tests/test_issues.py
@@ -175,4 +175,4 @@ def test_issue_316(self):
         """
         path = os.path.join(HERE, "pdfs/issue-316-example.pdf")
         with pdfplumber.open(path) as pdf:
-            assert pdf.metadata
+            assert pdf.metadata["Changes"][0]["CreationDate"] == "D:20061207105020Z00'00'"