Merge pull request #963 from dhdaines/structure_tree

Support for PDF 1.3 logical structure
jsvine · Nov 9, 2023 · 35ed9e0 · 35ed9e0
2 parents ba58e16 + 036044d
commit 35ed9e0
Show file tree

Hide file tree

Showing 15 changed files with 1,595 additions and 2 deletions.
diff --git a/docs/structure.md b/docs/structure.md
@@ -0,0 +1,61 @@
+# Structure Tree
+
+Since PDF 1.3 it is possible for a PDF to contain logical structure,
+contained in a *structure tree*.  In conjunction with PDF 1.2 [marked
+content sections](#marked-content-sections) this forms the basis of
+Tagged PDF and other accessibility features.
+
+Unfortunately, since all of these standards are optional and variably
+implemented in PDF authoring tools, and are frequently not enabled by
+default, it is not possible to rely on them to extract the structure
+of a PDF and associated content.  Nonetheless they can be useful as
+features for a heuristic or machine-learning based system, or for
+extracting particular structures such as tables.
+
+Since `pdfplumber`'s API is page-based, the structure is available for
+a particular page, using the `structure_tree` attribute:
+
+    with pdfplumber.open(pdffile) as pdf:
+        for element in pdf.pages[0].structure_tree:
+            print(element["type"], element.get("mcids"))
+            for child in element.get("children", []):
+                print(child["type"], child.get("mcids"))
+
+The `type` field contains the type of the structure element - the
+standard structure types can be seen in section 10.7.3 of [the PDF 1.7
+reference
+document](https://ghostscript.com/~robin/pdf_reference17.pdf#page=898),
+but usually they are rather HTML-like, if created by a recent PDF
+authoring tool (notably, older tools may simply produce `P` for
+everything).
+
+The `mcids` field contains the list of marked content section IDs
+corresponding to this element.
+
+The `lang` field is often present as well, and contains a language
+code for the text content, e.g. `"EN-US"` or `"FR-CA"`.
+
+The `alt_text` field will be present if the author has helpfully added
+alternate text to an image.  In some cases, `actual_text` may also be
+present.
+
+There are also various attributes that may be in the `attributes`
+field.  Some of these are quite useful indeed, such as `BBox` which
+gives you the bounding box of a `Table`, `Figure`, or `Image`.  You
+can see a full list of these [in the PDF
+spec](https://ghostscript.com/~robin/pdf_reference17.pdf#page=916).
+Note that the `BBox` is in PDF coordinate space with the origin at the
+bottom left of the page.  To convert it to `pdfplumber`'s space you
+can do, for example:
+
+    x0, y0, x1, y1 = element['attributes']['BBox']
+    top = page.height - y1
+    bottom = page.height - y0
+    doctop = page.initial_doctop + top
+    bbox = (x0, top, x1, bottom)
+
+It is also possible to get the structure tree for the entire document.
+In this case, because marked content IDs are specific to a given page,
+each element will also have a `page_number` field, which is the
+number of the page containing (partially or completely) this element,
+indexed from 1 (for consistency with `pdfplumber.Page`).
diff --git a/pdfplumber/cli.py b/pdfplumber/cli.py
@@ -2,8 +2,9 @@
 import argparse
 import json
 import sys
+from collections import defaultdict, deque
 from itertools import chain
-from typing import List
+from typing import Any, DefaultDict, Dict, List
 
 from .pdf import PDF
 
@@ -22,6 +23,19 @@ def parse_args(args_raw: List[str]) -> argparse.Namespace:
     parser.add_argument(
         "infile", nargs="?", type=argparse.FileType("rb"), default=sys.stdin.buffer
     )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--structure",
+        help="Write the structure tree as JSON.  "
+        "All other arguments except --pages, --laparams, and --indent will be ignored",
+        action="store_true",
+    )
+    group.add_argument(
+        "--structure-text",
+        help="Write the structure tree as JSON including text contents.  "
+        "All other arguments except --pages, --laparams, and --indent will be ignored",
+        action="store_true",
+    )
 
     parser.add_argument("--format", choices=["csv", "json"], default="csv")
 
@@ -55,11 +69,39 @@ def parse_args(args_raw: List[str]) -> argparse.Namespace:
     return args
 
 
+def add_text_to_mcids(pdf: PDF, data: List[Dict[str, Any]]) -> None:
+    page_contents: DefaultDict[int, Any] = defaultdict(lambda: defaultdict(str))
+    for page in pdf.pages:
+        text_contents = page_contents[page.page_number]
+        for c in page.chars:
+            mcid = c.get("mcid")
+            if mcid is None:
+                continue
+            text_contents[mcid] += c["text"]
+    d = deque(data)
+    while d:
+        el = d.popleft()
+        if "children" in el:
+            d.extend(el["children"])
+        pageno = el.get("page_number")
+        if pageno is None:
+            continue
+        text_contents = page_contents[pageno]
+        if "mcids" in el:
+            el["text"] = [text_contents[mcid] for mcid in el["mcids"]]
+
+
 def main(args_raw: List[str] = sys.argv[1:]) -> None:
     args = parse_args(args_raw)
 
     with PDF.open(args.infile, pages=args.pages, laparams=args.laparams) as pdf:
-        if args.format == "csv":
+        if args.structure:
+            print(json.dumps(pdf.structure_tree, indent=args.indent))
+        elif args.structure_text:
+            tree = pdf.structure_tree
+            add_text_to_mcids(pdf, tree)
+            print(json.dumps(tree, indent=args.indent, ensure_ascii=False))
+        elif args.format == "csv":
             pdf.to_csv(
                 sys.stdout,
                 args.types,

diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -29,6 +29,7 @@
 from . import utils
 from ._typing import T_bbox, T_num, T_obj, T_obj_list
 from .container import Container
+from .structure import PDFStructTree, StructTreeMissing
 from .table import T_table_settings, Table, TableFinder, TableSettings
 from .utils import decode_text, resolve_all, resolve_and_decode
 from .utils.text import TextMap
@@ -242,6 +243,14 @@ def width(self) -> T_num:
     def height(self) -> T_num:
         return self.bbox[3] - self.bbox[1]
 
+    @property
+    def structure_tree(self) -> List[Dict[str, Any]]:
+        """Return the structure tree for a page, if any."""
+        try:
+            return [elem.to_dict() for elem in PDFStructTree(self.pdf, self)]
+        except StructTreeMissing:
+            return []
+
     @property
     def layout(self) -> LTPage:
         if hasattr(self, "_layout"):

diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py
@@ -16,6 +16,7 @@
 from .container import Container
 from .page import Page
 from .repair import _repair
+from .structure import PDFStructTree, StructTreeMissing
 from .utils import resolve_and_decode
 
 logger = logging.getLogger(__name__)
@@ -164,6 +165,14 @@ def hyperlinks(self) -> List[Dict[str, Any]]:
         gen = (p.hyperlinks for p in self.pages)
         return list(itertools.chain(*gen))
 
+    @property
+    def structure_tree(self) -> List[Dict[str, Any]]:
+        """Return the structure tree for the document."""
+        try:
+            return [elem.to_dict() for elem in PDFStructTree(self)]
+        except StructTreeMissing:
+            return []
+
     def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
         return {
             "metadata": self.metadata,