Skip to content

Commit

Permalink
fix: complete coverage and fix handling of OBJR/MCR
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Sep 6, 2023
1 parent 8d485c3 commit 14f9a67
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 57 deletions.
4 changes: 2 additions & 2 deletions pdfplumber/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,11 @@ def main(args_raw: List[str] = sys.argv[1:]) -> None:

with PDF.open(args.infile, pages=args.pages, laparams=args.laparams) as pdf:
if args.structure:
json.dump(pdf.structure_tree, sys.stdout, indent=args.indent)
print(json.dumps(pdf.structure_tree, indent=args.indent))
elif args.structure_text:
tree = pdf.structure_tree
add_text_to_mcids(pdf, tree)
json.dump(tree, sys.stdout, indent=args.indent, ensure_ascii=False)
print(json.dumps(tree, indent=args.indent, ensure_ascii=False))
elif args.format == "csv":
pdf.to_csv(
sys.stdout,
Expand Down
102 changes: 58 additions & 44 deletions pdfplumber/structure.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from collections import deque
from dataclasses import asdict, dataclass, field
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple
Expand All @@ -10,6 +11,9 @@

from .utils import decode_text

logger = logging.getLogger(__name__)


if TYPE_CHECKING: # pragma: nocover
from .page import Page
from .pdf import PDF
Expand Down Expand Up @@ -116,18 +120,20 @@ def _make_attributes(
if key not in obj:
continue
attr_obj = resolve1(obj[key])
# It could be a list of attribute objects (why?)
if isinstance(attr_obj, list):
attr_obj_list.extend(attr_obj)
else:
attr_obj_list.append(attr_obj)
attr_objs = []
prev_obj = None
for aref in attr_obj_list:
# If we find a revision number, which might "follow
# the revision object" (the spec is incredibly unclear
# about how this actually works), then use it to
# decide whether to take the previous object...
if isinstance(aref, int): # pragma: nocover
# If we find a revision number, which might "follow the
# revision object" (the spec is not clear about what this
# should look like but it implies they are simply adjacent
# in a flat array), then use it to decide whether to take
# the previous object...
if isinstance(aref, int):
if aref == revision and prev_obj is not None:
attr_objs.append(prev_obj)
prev_obj = None
Expand All @@ -137,14 +143,15 @@ def _make_attributes(
prev_obj = resolve1(aref)
if prev_obj is not None:
attr_objs.append(prev_obj)
# Now merge all the relevant ones to a single set (FIXME: Not
# *really* sure this is how this is supposed to work... OMG)
# Now merge all the collected attribute objects into a
# single set (again, the spec doesn't really explain this but
# does say that attributes in /A supersede those in /C)
attr = {}
for obj in attr_objs:
if isinstance(obj, PSLiteral): # OMG
if isinstance(obj, PSLiteral):
key = decode_text(obj.name)
# Should be a warning at least!
if key not in self.class_map: # pragma: nocover
if key not in self.class_map:
logger.warning("Unknown attribute class %s", key)
continue
obj = self.class_map[key]
for k, v in obj.items():
Expand Down Expand Up @@ -230,30 +237,37 @@ def _parse_parent_tree(self, parent_array: List[Any]) -> None:
assert found_root
self._resolve_children(s)

def on_parsed_page(self, obj: Dict[str, Any]) -> bool:
    """Tell whether a structure object refers to a page being parsed.

    An object without a "Pg" entry is always accepted.  Otherwise the
    object id of the referenced page is checked against
    ``self.page_dict`` when that is set; failing that, it is compared
    with ``self.page.pageid`` when ``self.page`` is set.  When neither
    is set the object is accepted.
    """
    if "Pg" in obj:
        target_id = obj["Pg"].objid
        if self.page_dict is not None:
            return target_id in self.page_dict
        # The explicit None test on self.page is what lets mypy accept
        # the attribute access that follows.
        if self.page is not None and target_id != self.page.pageid:
            return False
    return True

def _parse_struct_tree(self) -> None:
"""Populate the structure tree starting from the root, skipping
unparsed pages and empty elements."""
root = resolve1(self.root["K"])

def on_parsed_page(obj: Dict[str, Any]) -> bool:
if self.page_dict is not None and "Pg" in obj: # pragma: nocover
page_objid = obj["Pg"].objid
return page_objid in self.page_dict
return True

# It could just be a single object ... it's in the spec (argh)
if isinstance(root, dict):
root = [self.root["K"]]
d = deque(root)
s = {}
while d:
ref = d.popleft()
if repr(ref) in s:
continue # pragma: nocover
# In case the tree is actually a DAG and not a tree...
if repr(ref) in s: # pragma: nocover (shouldn't happen)
continue
obj = resolve1(ref)
# Deref top-level OBJR skipping refs to unparsed pages
if isinstance(obj, dict) and "Obj" in obj: # pragma: nocover
if not on_parsed_page(obj):
if isinstance(obj, dict) and "Obj" in obj:
if not self.on_parsed_page(obj):
continue
ref = obj["Obj"]
obj = resolve1(ref)
Expand All @@ -265,30 +279,25 @@ def on_parsed_page(obj: Dict[str, Any]) -> bool:
if isinstance(child, PDFObjRef):
d.append(child)
elif isinstance(child, dict) and "Obj" in child:
if on_parsed_page(child):
if self.on_parsed_page(child):
d.append(child["Obj"])

# Traverse depth-first, removing empty elements (unsure how to
# do this non-recursively)
def prune(elements: List[Any]) -> List[Any]:
next_elements = []
for ref in elements:
obj = resolve1(ref)
if isinstance(ref, int):
next_elements.append(ref)
continue
elif isinstance(ref, dict):
if not on_parsed_page(ref): # pragma: nocover
elif isinstance(obj, dict):
if not self.on_parsed_page(obj):
continue
if "MCID" in ref: # pragma: nocover
next_elements.append(ref["MCID"])
if "MCID" in obj:
next_elements.append(obj["MCID"])
continue
elif "Obj" in ref:
ref = ref["Obj"]
elif isinstance(ref, PDFObjRef):
obj = resolve1(ref)
if isinstance(obj, dict) and "Obj" in obj: # pragma: nocover
if not on_parsed_page(obj):
continue
elif "Obj" in obj:
ref = obj["Obj"]
element, children = s[repr(ref)]
children = prune(children)
Expand All @@ -311,25 +320,30 @@ def _resolve_children(self, seen: Dict[str, Any]) -> None:
# It could just be a single object ... it's in the spec (argh)
if isinstance(root, dict):
root = [self.root["K"]]
d = deque(root)
self.children = []
# Create top-level self.children
parsed_root = []
for ref in root:
obj = resolve1(ref)
if isinstance(obj, dict) and "Obj" in obj:
if not self.on_parsed_page(obj):
continue
ref = obj["Obj"]
if repr(ref) in seen:
parsed_root.append(ref)
d = deque(parsed_root)
while d:
ref = d.popleft()
# The pruning (or parent tree construction) done above
# should ensure we never encounter dangling references,
# *but* you never know (should emit warnings...)
if repr(ref) not in seen: # pragma: nocover
continue
element, children = seen[repr(ref)]
assert element is not None, "Unparsed element"
for child in children:
if isinstance(child, int):
element.mcids.append(child)
elif isinstance(child, dict):
# Skip out-of-page MCIDS (which are obviously wrong!) and OBJRs
if "Pg" in child and self.page is not None: # pragma: nocover
if child["Pg"].objid != self.page.pageid:
continue
if "MCID" in child: # pragma: nocover
# Skip out-of-page MCIDS and OBJRs
if not self.on_parsed_page(child):
continue
if "MCID" in child:
element.mcids.append(child["MCID"])
elif "Obj" in child:
child = child["Obj"]
Expand All @@ -339,7 +353,7 @@ def _resolve_children(self, seen: Dict[str, Any]) -> None:
if child_element is not None:
element.children.append(child_element)
d.append(child)
self.children = [seen[repr(ref)][0] for ref in root if repr(ref) in seen]
self.children = [seen[repr(ref)][0] for ref in parsed_root]

def __iter__(self) -> Iterator[PDFStructElement]:
    """Return an iterator over the elements held in ``self.children``."""
    return iter(self.children)
Binary file modified tests/pdfs/hello_structure.pdf
Binary file not shown.
17 changes: 9 additions & 8 deletions tests/pdfs/make_xref.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import re
import sys

with open(sys.argv[1], "rb") as infh:
with open(sys.argv[1], "r+b") as infh:
pos = 0
xref = [(0, 65535, "f")]
for spam in infh:
Expand All @@ -17,11 +17,12 @@
elif text.strip() == "xref":
startxref = pos
pos = infh.tell()
print("xref")
print("0", len(xref))
infh.seek(startxref)
infh.write(b"xref\n")
infh.write(("0 %d\n" % len(xref)).encode("ascii"))
for x in xref:
print("%010d %05d %s " % x)
print("trailer << /Size %d /Root 1 0 R >>" % len(xref))
print("startxref")
print(startxref)
print("%%EOF")
infh.write(("%010d %05d %s \n" % x).encode("ascii"))
infh.write(("trailer << /Size %d /Root 1 0 R >>\n" % len(xref)).encode("ascii"))
infh.write(b"startxref\n")
infh.write(("%d\n" % startxref).encode("ascii"))
infh.write(b"%%EOF\n")
37 changes: 34 additions & 3 deletions tests/test_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,8 @@ def test_structure_tree(self):

HELLO = [
{
"type": "Document",
"type": "Section",
"page_number": 1,
"children": [
{
"type": "P",
Expand All @@ -818,6 +819,33 @@ def test_structure_tree(self):
"mcids": [1],
},
],
},
{
"type": "P",
"revision": 1,
"page_number": 2,
"attributes": {"O": "Foo", "A1": 3, "A2": 3},
"mcids": [2],
},
]
HELLO1 = [
{
"type": "Section",
"page_number": 1,
"children": [
{
"type": "P",
"page_number": 1,
"attributes": {"O": "Foo", "A1": 1},
"mcids": [1],
},
],
}
]
HELLO1P = [
{
"type": "Section",
"children": [{"type": "P", "attributes": {"O": "Foo", "A1": 1}, "mcids": [1]}],
}
]

Expand Down Expand Up @@ -929,5 +957,8 @@ def test_chelsea_pdta(self):
def test_hello_structure(self):
# Synthetic PDF to test some corner cases
path = os.path.join(HERE, "pdfs/hello_structure.pdf")
pdf = pdfplumber.open(path)
assert pdf.structure_tree == HELLO
with pdfplumber.open(path) as pdf:
assert pdf.structure_tree == HELLO
assert pdf.pages[0].structure_tree == HELLO1P
with pdfplumber.open(path, pages=[1]) as pdf:
assert pdf.structure_tree == HELLO1

0 comments on commit 14f9a67

Please sign in to comment.