Skip to content

Commit

Permalink
Recursively parse metadata values to handle nested PDFObjRef objects
Browse files Browse the repository at this point in the history
Fixes #316
  • Loading branch information
samkit-jain committed Nov 29, 2020
1 parent 9f0a1ad commit 2d9415c
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 12 deletions.
13 changes: 2 additions & 11 deletions pdfplumber/pdf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .container import Container
from .page import Page
from .utils import decode_text
from .utils import resolve_and_decode

import logging
import pathlib
Expand Down Expand Up @@ -31,16 +31,7 @@ def __init__(self, stream, pages=None, laparams=None, precision=0.001, password=
self.metadata.update(info)
for k, v in self.metadata.items():
try:
if hasattr(v, "resolve"):
v = v.resolve()
if type(v) == list:
self.metadata[k] = list(map(decode_text, v))
elif isinstance(v, PSLiteral):
self.metadata[k] = decode_text(v.name)
elif isinstance(v, (str, bytes)):
self.metadata[k] = decode_text(v)
else:
self.metadata[k] = v
self.metadata[k] = resolve_and_decode(v)
except Exception as e:
# This metadata value could not be parsed. Instead of failing the PDF read,
# treat it as a warning.
Expand Down
18 changes: 18 additions & 0 deletions pdfplumber/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,24 @@ def decode_text(s):
return "".join(PDFDocEncoding[o] for o in ords)


def resolve_and_decode(obj):
    """Recursively resolve and decode a metadata value.

    Objects exposing a ``resolve()`` method (e.g. nested ``PDFObjRef``
    references) are resolved first. The resolved value is then handled
    by type:

    * ``list``  -> a new list with every item processed recursively
    * ``dict``  -> a new dict with every value processed recursively
      (keys are kept as-is)
    * ``PSLiteral`` -> ``decode_text`` applied to its ``name``
    * ``str`` / ``bytes`` -> ``decode_text`` applied to the value
    * anything else -> returned unchanged

    The input is never modified: containers are rebuilt rather than
    updated in place, because the caller (or the object that
    ``resolve()`` returned the value from) may still hold a reference
    to the original container.
    """
    # Follow an indirect reference before inspecting the value's type.
    if hasattr(obj, "resolve"):
        obj = obj.resolve()

    if isinstance(obj, list):
        return [resolve_and_decode(item) for item in obj]

    if isinstance(obj, dict):
        # Build a fresh dict instead of assigning into `obj`, so the
        # resolved source dictionary is left untouched.
        return {key: resolve_and_decode(value) for key, value in obj.items()}

    if isinstance(obj, PSLiteral):
        return decode_text(obj.name)

    if isinstance(obj, (str, bytes)):
        return decode_text(obj)

    return obj


def decode_psl_list(_list):
return [
decode_text(value.name) if isinstance(value, PSLiteral) else value
Expand Down
2 changes: 1 addition & 1 deletion tests/test_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,4 +175,4 @@ def test_issue_316(self):
"""
path = os.path.join(HERE, "pdfs/issue-316-example.pdf")
with pdfplumber.open(path) as pdf:
assert pdf.metadata
assert pdf.metadata["Changes"][0]["CreationDate"] == "D:20061207105020Z00'00'"

0 comments on commit 2d9415c

Please sign in to comment.