Skip to content

Commit

Permalink
Treat invalid/unparseable metadata values as warnings
Browse files Browse the repository at this point in the history
Certain invalid values, if parseable, don't trigger a warning; only unparseable (always invalid) values do.
  • Loading branch information
samkit-jain committed Nov 26, 2020
1 parent d3b84da commit 9f0a1ad
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 10 deletions.
28 changes: 18 additions & 10 deletions pdfplumber/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .page import Page
from .utils import decode_text

import logging
import pathlib
import itertools
from pdfminer.pdfparser import PDFParser
Expand All @@ -12,6 +13,8 @@
from pdfminer.converter import PDFPageAggregator
from pdfminer.psparser import PSLiteral

logger = logging.getLogger(__name__)


class PDF(Container):
cached_properties = Container.cached_properties + ["_pages"]
Expand All @@ -27,16 +30,21 @@ def __init__(self, stream, pages=None, laparams=None, precision=0.001, password=
for info in self.doc.info:
self.metadata.update(info)
for k, v in self.metadata.items():
if hasattr(v, "resolve"):
v = v.resolve()
if type(v) == list:
self.metadata[k] = list(map(decode_text, v))
elif isinstance(v, PSLiteral):
self.metadata[k] = decode_text(v.name)
elif isinstance(v, (str, bytes)):
self.metadata[k] = decode_text(v)
else:
self.metadata[k] = v
try:
if hasattr(v, "resolve"):
v = v.resolve()
if type(v) == list:
self.metadata[k] = list(map(decode_text, v))
elif isinstance(v, PSLiteral):
self.metadata[k] = decode_text(v.name)
elif isinstance(v, (str, bytes)):
self.metadata[k] = decode_text(v)
else:
self.metadata[k] = v
except Exception as e:
# This metadata value could not be parsed. Instead of failing the PDF read,
# treat it as a warning.
logger.warning(f'[WARNING] Metadata key "{k}" could not be parsed due to exception: {str(e)}')
self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

Expand Down
Binary file added tests/pdfs/issue-316-example.pdf
Binary file not shown.
8 changes: 8 additions & 0 deletions tests/test_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,11 @@ def test_issue_297(self):
path = os.path.join(HERE, "pdfs/issue-297-example.pdf")
with pdfplumber.open(path) as pdf:
assert isinstance(pdf.metadata["Copies"], int)

def test_issue_316(self):
    """
    Handle invalid metadata: opening the issue-316 example PDF must
    succeed and yield a non-empty metadata mapping.
    """
    pdf_path = os.path.join(HERE, "pdfs/issue-316-example.pdf")
    with pdfplumber.open(pdf_path) as document:
        metadata = document.metadata
        assert metadata

0 comments on commit 9f0a1ad

Please sign in to comment.