Skip to content

Commit

Permalink
Merge pull request #963 from dhdaines/structure_tree
Browse files Browse the repository at this point in the history
Support for PDF 1.3 logical structure
  • Loading branch information
jsvine authored Nov 9, 2023
2 parents ba58e16 + 036044d commit 35ed9e0
Show file tree
Hide file tree
Showing 15 changed files with 1,595 additions and 2 deletions.
61 changes: 61 additions & 0 deletions docs/structure.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Structure Tree

Since PDF 1.3 it is possible for a PDF to contain logical structure,
contained in a *structure tree*. In conjunction with PDF 1.2 [marked
content sections](#marked-content-sections) this forms the basis of
Tagged PDF and other accessibility features.

Unfortunately, since all of these standards are optional and variably
implemented in PDF authoring tools, and are frequently not enabled by
default, it is not possible to rely on them to extract the structure
of a PDF and associated content. Nonetheless they can be useful as
features for a heuristic or machine-learning based system, or for
extracting particular structures such as tables.

Since `pdfplumber`'s API is page-based, the structure is available for
a particular page, using the `structure_tree` attribute:

    with pdfplumber.open(pdffile) as pdf:
        for element in pdf.pages[0].structure_tree:
            print(element["type"], element.get("mcids"))
            for child in element.get("children", []):
                print(child["type"], child.get("mcids"))

The `type` field contains the type of the structure element - the
standard structure types can be seen in section 10.7.3 of [the PDF 1.7
reference
document](https://ghostscript.com/~robin/pdf_reference17.pdf#page=898),
but usually they are rather HTML-like, if created by a recent PDF
authoring tool (notably, older tools may simply produce `P` for
everything).

The `mcids` field contains the list of marked content section IDs
corresponding to this element.

The `lang` field is often present as well, and contains a language
code for the text content, e.g. `"EN-US"` or `"FR-CA"`.

The `alt_text` field will be present if the author has helpfully added
alternate text to an image. In some cases, `actual_text` may also be
present.

There are also various attributes that may be in the `attributes`
field. Some of these are quite useful indeed, such as `BBox` which
gives you the bounding box of a `Table`, `Figure`, or `Image`. You
can see a full list of these [in the PDF
spec](https://ghostscript.com/~robin/pdf_reference17.pdf#page=916).
Note that the `BBox` is in PDF coordinate space with the origin at the
bottom left of the page. To convert it to `pdfplumber`'s space you
can do, for example:

    x0, y0, x1, y1 = element['attributes']['BBox']
    top = page.height - y1
    bottom = page.height - y0
    doctop = page.initial_doctop + top
    bbox = (x0, top, x1, bottom)

It is also possible to get the structure tree for the entire document.
In this case, because marked content IDs are specific to a given page,
each element will also have a `page_number` field, which is the
number of the page containing (partially or completely) this element,
indexed from 1 (for consistency with `pdfplumber.Page`).
46 changes: 44 additions & 2 deletions pdfplumber/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import argparse
import json
import sys
from collections import defaultdict, deque
from itertools import chain
from typing import List
from typing import Any, DefaultDict, Dict, List

from .pdf import PDF

Expand All @@ -22,6 +23,19 @@ def parse_args(args_raw: List[str]) -> argparse.Namespace:
parser.add_argument(
"infile", nargs="?", type=argparse.FileType("rb"), default=sys.stdin.buffer
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
"--structure",
help="Write the structure tree as JSON. "
"All other arguments except --pages, --laparams, and --indent will be ignored",
action="store_true",
)
group.add_argument(
"--structure-text",
help="Write the structure tree as JSON including text contents. "
"All other arguments except --pages, --laparams, and --indent will be ignored",
action="store_true",
)

parser.add_argument("--format", choices=["csv", "json"], default="csv")

Expand Down Expand Up @@ -55,11 +69,39 @@ def parse_args(args_raw: List[str]) -> argparse.Namespace:
return args


def add_text_to_mcids(pdf: PDF, data: List[Dict[str, Any]]) -> None:
    """Attach the text of each marked-content section to structure elements.

    The structure elements in ``data`` (and, breadth-first, everything under
    their ``"children"`` keys) are modified in place: every element that has
    both a ``"page_number"`` and an ``"mcids"`` key gains a ``"text"`` key
    holding one string per MCID, in the same order as ``"mcids"``.  Each
    string is the concatenation of the ``"text"`` of all chars on that page
    carrying that MCID; an MCID with no matching chars yields ``""``.

    Args:
        pdf: Object whose ``pages`` each expose ``page_number`` and ``chars``
            (an iterable of char dicts with ``"text"`` and optional ``"mcid"``).
        data: Top-level structure elements, as dicts.

    Returns:
        None; ``data`` is updated in place.
    """
    # page number -> (MCID -> text accumulated from that page's chars)
    text_by_page: DefaultDict[int, Any] = defaultdict(lambda: defaultdict(str))
    for page in pdf.pages:
        text_by_mcid = text_by_page[page.page_number]
        for char in page.chars:
            mcid = char.get("mcid")
            if mcid is not None:
                text_by_mcid[mcid] += char["text"]

    # Breadth-first walk so that nested elements are annotated as well.
    pending = deque(data)
    while pending:
        element = pending.popleft()
        if "children" in element:
            pending.extend(element["children"])
        page_number = element.get("page_number")
        # MCIDs are only meaningful relative to a page, so elements without
        # a page number are left untouched (their children are still visited).
        if page_number is not None and "mcids" in element:
            mcid_text = text_by_page[page_number]
            element["text"] = [mcid_text[mcid] for mcid in element["mcids"]]


def main(args_raw: List[str] = sys.argv[1:]) -> None:
args = parse_args(args_raw)

with PDF.open(args.infile, pages=args.pages, laparams=args.laparams) as pdf:
if args.format == "csv":
if args.structure:
print(json.dumps(pdf.structure_tree, indent=args.indent))
elif args.structure_text:
tree = pdf.structure_tree
add_text_to_mcids(pdf, tree)
print(json.dumps(tree, indent=args.indent, ensure_ascii=False))
elif args.format == "csv":
pdf.to_csv(
sys.stdout,
args.types,
Expand Down
9 changes: 9 additions & 0 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from . import utils
from ._typing import T_bbox, T_num, T_obj, T_obj_list
from .container import Container
from .structure import PDFStructTree, StructTreeMissing
from .table import T_table_settings, Table, TableFinder, TableSettings
from .utils import decode_text, resolve_all, resolve_and_decode
from .utils.text import TextMap
Expand Down Expand Up @@ -242,6 +243,14 @@ def width(self) -> T_num:
def height(self) -> T_num:
return self.bbox[3] - self.bbox[1]

@property
def structure_tree(self) -> List[Dict[str, Any]]:
    """Return this page's structure tree as a list of dicts.

    Each element yielded by ``PDFStructTree(self.pdf, self)`` is converted
    with its ``to_dict()`` method.  If ``StructTreeMissing`` is raised at any
    point while building or converting the tree, an empty list is returned
    instead.
    """
    try:
        tree = PDFStructTree(self.pdf, self)
        return [node.to_dict() for node in tree]
    except StructTreeMissing:
        # A missing structure tree is reported as "no elements", not an error.
        return []

@property
def layout(self) -> LTPage:
if hasattr(self, "_layout"):
Expand Down
9 changes: 9 additions & 0 deletions pdfplumber/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .container import Container
from .page import Page
from .repair import _repair
from .structure import PDFStructTree, StructTreeMissing
from .utils import resolve_and_decode

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -164,6 +165,14 @@ def hyperlinks(self) -> List[Dict[str, Any]]:
gen = (p.hyperlinks for p in self.pages)
return list(itertools.chain(*gen))

@property
def structure_tree(self) -> List[Dict[str, Any]]:
    """Return the whole document's structure tree as a list of dicts.

    Each element yielded by ``PDFStructTree(self)`` is converted with its
    ``to_dict()`` method.  If ``StructTreeMissing`` is raised at any point
    while building or converting the tree, an empty list is returned instead.
    """
    try:
        tree = PDFStructTree(self)
        return [node.to_dict() for node in tree]
    except StructTreeMissing:
        # A missing structure tree is reported as "no elements", not an error.
        return []

def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
return {
"metadata": self.metadata,
Expand Down
Loading

0 comments on commit 35ed9e0

Please sign in to comment.