Skip to content

Commit

Permalink
fix: properly exclude content from unparsed pages
Browse files (browse the repository at this point in the history)
  • Loading branch information
dhdaines committed Aug 18, 2023
1 parent f765351 commit 5264d2a
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 21 deletions.
47 changes: 27 additions & 20 deletions pdfplumber/structure.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -236,10 +236,10 @@ def _parse_struct_tree(self) -> None:
root = resolve1(self.root["K"])

def on_parsed_page(obj: Dict[str, Any]) -> bool:
if self.page_dict is not None and "Pg" in obj:
if self.page_dict is not None and "Pg" in obj: # pragma: nocover
page_objid = obj["Pg"].objid
return page_objid in self.page_dict
return True # pragma: nocover
return True

# It could just be a single object ... it's in the spec (argh)
if isinstance(root, dict):
Expand All @@ -251,8 +251,7 @@ def on_parsed_page(obj: Dict[str, Any]) -> bool:
if repr(ref) in s:
continue # pragma: nocover
obj = resolve1(ref)
# Dereference OBJR right away, skipping objects on
# unparsed pages
# Dereference top-level OBJRs, skipping refs to unparsed pages
if isinstance(obj, dict) and "Obj" in obj: # pragma: nocover
if not on_parsed_page(obj):
continue
Expand All @@ -261,12 +260,12 @@ def on_parsed_page(obj: Dict[str, Any]) -> bool:
element, children = self._make_element(obj)
# Similar to above, delay resolving the children to avoid
# tree-recursion.
s[repr(ref)] = self._make_element(obj)
s[repr(ref)] = element, children
for child in children:
if isinstance(child, PDFObjRef):
d.append(child)
elif isinstance(child, dict) and "Obj" in child:
if on_parsed_page(obj):
if on_parsed_page(child):
d.append(child["Obj"])

# Traverse depth-first, removing empty elements (unsure how to
Expand All @@ -277,12 +276,20 @@ def prune(elements: List[Any]) -> List[Any]:
if isinstance(ref, int):
next_elements.append(ref)
continue
if isinstance(ref, dict):
if "MCID" in ref: # type MCR
elif isinstance(ref, dict):
if not on_parsed_page(ref): # pragma: nocover
continue
if "MCID" in ref: # pragma: nocover
next_elements.append(ref["MCID"])
continue
elif "Obj" in ref: # type OBJR
elif "Obj" in ref:
ref = ref["Obj"]
elif isinstance(ref, PDFObjRef):
obj = resolve1(ref)
if isinstance(obj, dict) and "Obj" in obj: # pragma: nocover
if not on_parsed_page(obj):
continue
ref = obj["Obj"]
element, children = s[repr(ref)]
children = prune(children)
# See assertions below
Expand Down Expand Up @@ -315,19 +322,19 @@ def _resolve_children(self, seen: Dict[str, Any]) -> None:
element, children = seen[repr(ref)]
assert element is not None, "Unparsed element"
for child in children:
if isinstance(child, dict): # Ugh...
# Deal with MCR and OBJR dictionaries
if "MCID" in child: # pragma: nocover
child = child["MCID"]
elif "Obj" in child: # type OBJR
# If we are a single page, skip out-of-page refs
if "Pg" in child and self.page is not None: # pragma: nocover
if child["Pg"].objid != self.page.pageid:
continue
child = child["Obj"]
if isinstance(child, int):
element.mcids.append(child)
elif isinstance(child, PDFObjRef):
elif isinstance(child, dict):
# Skip out-of-page MCIDs (which are obviously wrong!) and OBJRs
if "Pg" in child and self.page is not None: # pragma: nocover
if child["Pg"].objid != self.page.pageid:
continue
if "MCID" in child: # pragma: nocover
element.mcids.append(child["MCID"])
elif "Obj" in child:
child = child["Obj"]
# NOTE: if, not elif, in case of OBJR above
if isinstance(child, PDFObjRef):
child_element, _ = seen.get(repr(child), (None, None))
if child_element is not None:
element.children.append(child_element)
Expand Down
6 changes: 5 additions & 1 deletion tests/test_structure.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -882,6 +882,9 @@ def test_chelsea_pdta(self):
# This page has no structure tree (really!)
tree8 = pdf.pages[7].structure_tree
assert tree8 == []
# We should also have no structure tree here
with pdfplumber.open(path, pages=[8]) as pdf8:
assert pdf8.structure_tree == []
# This page is empty
tree3 = pdf.pages[3].structure_tree
assert tree3 == []
Expand All @@ -891,12 +894,13 @@ def test_chelsea_pdta(self):
pdf = pdfplumber.open(path, pages=[3])
tree2 = pdf.structure_tree
assert tree2
# Compare (mostly). FIXME: ignores weird extra MCID 0 in tree2
# Compare modulo page_number
d = deque(zip(tree1, tree2))
while d:
el1, el2 = d.popleft()
if "page_number" in el1:
assert el1["page_number"] == 3
assert el1 == el2
if "children" in el1:
assert len(el1["children"]) == len(el2["children"])
d.extend(zip(el1["children"], el2["children"]))

0 comments on commit 5264d2a

Please sign in to comment.