fix: complete coverage and fix handling of OBJR/MCR

jsvine · Sep 6, 2023 · 14f9a67 · 14f9a67
1 parent 8d485c3
commit 14f9a67
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 57 deletions.
diff --git a/pdfplumber/cli.py b/pdfplumber/cli.py
@@ -96,11 +96,11 @@ def main(args_raw: List[str] = sys.argv[1:]) -> None:
 
     with PDF.open(args.infile, pages=args.pages, laparams=args.laparams) as pdf:
         if args.structure:
-            json.dump(pdf.structure_tree, sys.stdout, indent=args.indent)
+            print(json.dumps(pdf.structure_tree, indent=args.indent))
         elif args.structure_text:
             tree = pdf.structure_tree
             add_text_to_mcids(pdf, tree)
-            json.dump(tree, sys.stdout, indent=args.indent, ensure_ascii=False)
+            print(json.dumps(tree, indent=args.indent, ensure_ascii=False))
         elif args.format == "csv":
             pdf.to_csv(
                 sys.stdout,

diff --git a/pdfplumber/structure.py b/pdfplumber/structure.py
@@ -1,3 +1,4 @@
+import logging
 from collections import deque
 from dataclasses import asdict, dataclass, field
 from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple
@@ -10,6 +11,9 @@
 
 from .utils import decode_text
 
+logger = logging.getLogger(__name__)
+
+
 if TYPE_CHECKING:  # pragma: nocover
     from .page import Page
     from .pdf import PDF
@@ -116,18 +120,20 @@ def _make_attributes(
             if key not in obj:
                 continue
             attr_obj = resolve1(obj[key])
+            # It could be a list of attribute objects (why?)
             if isinstance(attr_obj, list):
                 attr_obj_list.extend(attr_obj)
             else:
                 attr_obj_list.append(attr_obj)
         attr_objs = []
         prev_obj = None
         for aref in attr_obj_list:
-            # If we find a revision number, which might "follow
-            # the revision object" (the spec is incredibly unclear
-            # about how this actually works), then use it to
-            # decide whether to take the previous object...
-            if isinstance(aref, int):  # pragma: nocover
+            # If we find a revision number, which might "follow the
+            # revision object" (the spec is not clear about what this
+            # should look like but it implies they are simply adjacent
+            # in a flat array), then use it to decide whether to take
+            # the previous object...
+            if isinstance(aref, int):
                 if aref == revision and prev_obj is not None:
                     attr_objs.append(prev_obj)
                 prev_obj = None
@@ -137,14 +143,15 @@ def _make_attributes(
                 prev_obj = resolve1(aref)
         if prev_obj is not None:
             attr_objs.append(prev_obj)
-        # Now merge all the relevant ones to a single set (FIXME: Not
-        # *really* sure this is how this is supposed to work... OMG)
+        # Now merge all the collected attribute objects into a
+        # single set (again, the spec doesn't really explain this but
+        # does say that attributes in /A supersede those in /C)
         attr = {}
         for obj in attr_objs:
-            if isinstance(obj, PSLiteral):  # OMG
+            if isinstance(obj, PSLiteral):
                 key = decode_text(obj.name)
-                # Should be a warning at least!
-                if key not in self.class_map:  # pragma: nocover
+                if key not in self.class_map:
+                    logger.warning("Unknown attribute class %s", key)
                     continue
                 obj = self.class_map[key]
             for k, v in obj.items():
@@ -230,30 +237,37 @@ def _parse_parent_tree(self, parent_array: List[Any]) -> None:
         assert found_root
         self._resolve_children(s)
 
+    def on_parsed_page(self, obj: Dict[str, Any]) -> bool:
+        if "Pg" not in obj:
+            return True
+        page_objid = obj["Pg"].objid
+        if self.page_dict is not None:
+            return page_objid in self.page_dict
+        if self.page is not None:
+            # We have to do this to satisfy mypy
+            if page_objid != self.page.pageid:
+                return False
+        return True
+
     def _parse_struct_tree(self) -> None:
         """Populate the structure tree starting from the root, skipping
         unparsed pages and empty elements."""
         root = resolve1(self.root["K"])
 
-        def on_parsed_page(obj: Dict[str, Any]) -> bool:
-            if self.page_dict is not None and "Pg" in obj:  # pragma: nocover
-                page_objid = obj["Pg"].objid
-                return page_objid in self.page_dict
-            return True
-
         # It could just be a single object ... it's in the spec (argh)
         if isinstance(root, dict):
             root = [self.root["K"]]
         d = deque(root)
         s = {}
         while d:
             ref = d.popleft()
-            if repr(ref) in s:
-                continue  # pragma: nocover
+            # In case the tree is actually a DAG and not a tree...
+            if repr(ref) in s:  # pragma: nocover (shouldn't happen)
+                continue
             obj = resolve1(ref)
             # Deref top-level OBJR skipping refs to unparsed pages
-            if isinstance(obj, dict) and "Obj" in obj:  # pragma: nocover
-                if not on_parsed_page(obj):
+            if isinstance(obj, dict) and "Obj" in obj:
+                if not self.on_parsed_page(obj):
                     continue
                 ref = obj["Obj"]
                 obj = resolve1(ref)
@@ -265,30 +279,25 @@ def on_parsed_page(obj: Dict[str, Any]) -> bool:
                 if isinstance(child, PDFObjRef):
                     d.append(child)
                 elif isinstance(child, dict) and "Obj" in child:
-                    if on_parsed_page(child):
+                    if self.on_parsed_page(child):
                         d.append(child["Obj"])
 
         # Traverse depth-first, removing empty elements (unsure how to
         # do this non-recursively)
         def prune(elements: List[Any]) -> List[Any]:
             next_elements = []
             for ref in elements:
+                obj = resolve1(ref)
                 if isinstance(ref, int):
                     next_elements.append(ref)
                     continue
-                elif isinstance(ref, dict):
-                    if not on_parsed_page(ref):  # pragma: nocover
+                elif isinstance(obj, dict):
+                    if not self.on_parsed_page(obj):
                         continue
-                    if "MCID" in ref:  # pragma: nocover
-                        next_elements.append(ref["MCID"])
+                    if "MCID" in obj:
+                        next_elements.append(obj["MCID"])
                         continue
-                    elif "Obj" in ref:
-                        ref = ref["Obj"]
-                elif isinstance(ref, PDFObjRef):
-                    obj = resolve1(ref)
-                    if isinstance(obj, dict) and "Obj" in obj:  # pragma: nocover
-                        if not on_parsed_page(obj):
-                            continue
+                    elif "Obj" in obj:
                         ref = obj["Obj"]
                 element, children = s[repr(ref)]
                 children = prune(children)
@@ -311,25 +320,30 @@ def _resolve_children(self, seen: Dict[str, Any]) -> None:
         # It could just be a single object ... it's in the spec (argh)
         if isinstance(root, dict):
             root = [self.root["K"]]
-        d = deque(root)
+        self.children = []
+        # Create top-level self.children
+        parsed_root = []
+        for ref in root:
+            obj = resolve1(ref)
+            if isinstance(obj, dict) and "Obj" in obj:
+                if not self.on_parsed_page(obj):
+                    continue
+                ref = obj["Obj"]
+            if repr(ref) in seen:
+                parsed_root.append(ref)
+        d = deque(parsed_root)
         while d:
             ref = d.popleft()
-            # The pruning (or parent tree construction) done above
-            # should ensure we never encounter dangling references,
-            # *but* you never know (should emit warnings...)
-            if repr(ref) not in seen:  # pragma: nocover
-                continue
             element, children = seen[repr(ref)]
             assert element is not None, "Unparsed element"
             for child in children:
                 if isinstance(child, int):
                     element.mcids.append(child)
                 elif isinstance(child, dict):
-                    # Skip out-of-page MCIDS (which are obviously wrong!) and OBJRs
-                    if "Pg" in child and self.page is not None:  # pragma: nocover
-                        if child["Pg"].objid != self.page.pageid:
-                            continue
-                    if "MCID" in child:  # pragma: nocover
+                    # Skip out-of-page MCIDS and OBJRs
+                    if not self.on_parsed_page(child):
+                        continue
+                    if "MCID" in child:
                         element.mcids.append(child["MCID"])
                     elif "Obj" in child:
                         child = child["Obj"]
@@ -339,7 +353,7 @@ def _resolve_children(self, seen: Dict[str, Any]) -> None:
                     if child_element is not None:
                         element.children.append(child_element)
                         d.append(child)
-        self.children = [seen[repr(ref)][0] for ref in root if repr(ref) in seen]
+        self.children = [seen[repr(ref)][0] for ref in parsed_root]
 
     def __iter__(self) -> Iterator[PDFStructElement]:
         return iter(self.children)
diff --git a/tests/pdfs/hello_structure.pdf b/tests/pdfs/hello_structure.pdf
diff --git a/tests/pdfs/make_xref.py b/tests/pdfs/make_xref.py
@@ -7,7 +7,7 @@
 import re
 import sys
 
-with open(sys.argv[1], "rb") as infh:
+with open(sys.argv[1], "r+b") as infh:
     pos = 0
     xref = [(0, 65535, "f")]
     for spam in infh:
@@ -17,11 +17,12 @@
         elif text.strip() == "xref":
             startxref = pos
         pos = infh.tell()
-    print("xref")
-    print("0", len(xref))
+    infh.seek(startxref)
+    infh.write(b"xref\n")
+    infh.write(("0 %d\n" % len(xref)).encode("ascii"))
     for x in xref:
-        print("%010d %05d %s " % x)
-    print("trailer  << /Size %d /Root 1 0 R >>" % len(xref))
-    print("startxref")
-    print(startxref)
-    print("%%EOF")
+        infh.write(("%010d %05d %s \n" % x).encode("ascii"))
+    infh.write(("trailer  << /Size %d /Root 1 0 R >>\n" % len(xref)).encode("ascii"))
+    infh.write(b"startxref\n")
+    infh.write(("%d\n" % startxref).encode("ascii"))
+    infh.write(b"%%EOF\n")
diff --git a/tests/test_structure.py b/tests/test_structure.py
@@ -803,7 +803,8 @@ def test_structure_tree(self):
 
 HELLO = [
     {
-        "type": "Document",
+        "type": "Section",
+        "page_number": 1,
         "children": [
             {
                 "type": "P",
@@ -818,6 +819,33 @@ def test_structure_tree(self):
                 "mcids": [1],
             },
         ],
+    },
+    {
+        "type": "P",
+        "revision": 1,
+        "page_number": 2,
+        "attributes": {"O": "Foo", "A1": 3, "A2": 3},
+        "mcids": [2],
+    },
+]
+HELLO1 = [
+    {
+        "type": "Section",
+        "page_number": 1,
+        "children": [
+            {
+                "type": "P",
+                "page_number": 1,
+                "attributes": {"O": "Foo", "A1": 1},
+                "mcids": [1],
+            },
+        ],
+    }
+]
+HELLO1P = [
+    {
+        "type": "Section",
+        "children": [{"type": "P", "attributes": {"O": "Foo", "A1": 1}, "mcids": [1]}],
     }
 ]
 
@@ -929,5 +957,8 @@ def test_chelsea_pdta(self):
     def test_hello_structure(self):
         # Synthetic PDF to test some corner cases
         path = os.path.join(HERE, "pdfs/hello_structure.pdf")
-        pdf = pdfplumber.open(path)
-        assert pdf.structure_tree == HELLO
+        with pdfplumber.open(path) as pdf:
+            assert pdf.structure_tree == HELLO
+            assert pdf.pages[0].structure_tree == HELLO1P
+        with pdfplumber.open(path, pages=[1]) as pdf:
+            assert pdf.structure_tree == HELLO1