Skip to content

Commit

Permalink
feat: add --structure-text, like pdfinfo -struct-text (but better)
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Aug 10, 2023
1 parent 5a18a82 commit c1bacc5
Showing 1 changed file with 40 additions and 4 deletions.
44 changes: 40 additions & 4 deletions pdfplumber/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import argparse
import json
import sys
from collections import deque
from itertools import chain
from typing import List
from typing import Any, Dict, List

from .pdf import PDF

Expand All @@ -22,13 +23,20 @@ def parse_args(args_raw: List[str]) -> argparse.Namespace:
parser.add_argument(
"infile", nargs="?", type=argparse.FileType("rb"), default=sys.stdin.buffer
)

parser.add_argument(
group = parser.add_mutually_exclusive_group()
group.add_argument(
"--structure",
help="Write the structure tree as JSON. "
"All arguments except --pages, --laparams, and --indent will be ignored",
"All other arguments except --pages, --laparams, and --indent will be ignored",
action="store_true",
)
group.add_argument(
"--structure-text",
help="Write the structure tree as JSON including text contents. "
"All other arguments except --pages, --laparams, and --indent will be ignored",
action="store_true",
)

parser.add_argument("--format", choices=["csv", "json"], default="csv")

parser.add_argument("--types", nargs="+")
Expand Down Expand Up @@ -61,12 +69,40 @@ def parse_args(args_raw: List[str]) -> argparse.Namespace:
return args


def add_text_to_mcids(pdf: PDF, data: List[Dict[str, Any]]) -> None:
    """Attach the text of each marked-content section to structure elements.

    Every element in ``data`` (searched through its ``"children"``) that has
    both a ``"page_number"`` and a ``"mcids"`` entry gains a ``"text"`` entry:
    a list with one string per MCID, in the same order as ``"mcids"``.
    ``data`` is modified in place.

    Args:
        pdf: Object whose ``pages`` each expose ``page_number`` and ``chars``;
            each char is a dict with ``"text"`` and, optionally, ``"mcid"``.
        data: Structure tree as a list of (possibly nested) dicts.

    Notes:
        * Text is keyed by the page's own ``page_number`` rather than by its
          position in ``pdf.pages``, so lookups stay correct when only a
          subset of pages was opened.
        * An MCID with no characters (or on a page that was not extracted)
          yields ``""`` instead of raising ``IndexError``.
        * An element with ``"mcids"`` but no ``"page_number"`` is left
          untouched, since its page cannot be determined from the element.
    """
    # page number -> MCID -> text fragments, in the order the chars appear
    page_contents: Dict[int, Dict[int, List[str]]] = {}
    for page in pdf.pages:
        fragments = page_contents.setdefault(page.page_number, {})
        for char in page.chars:
            mcid = char.get("mcid")
            if mcid is None:
                continue
            fragments.setdefault(mcid, []).append(char["text"])

    # Breadth-first walk over the tree; each element is resolved on its own,
    # with no state carried over from the previously visited element.
    queue = deque(data)
    while queue:
        el = queue.popleft()
        queue.extend(el.get("children", ()))
        page_number = el.get("page_number")
        if page_number is None or "mcids" not in el:
            continue
        fragments = page_contents.get(page_number, {})
        el["text"] = ["".join(fragments.get(mcid, ())) for mcid in el["mcids"]]


def main(args_raw: List[str] = sys.argv[1:]) -> None:
args = parse_args(args_raw)

with PDF.open(args.infile, pages=args.pages, laparams=args.laparams) as pdf:
if args.structure:
json.dump(pdf.structure_tree, sys.stdout, indent=args.indent)
elif args.structure_text:
tree = pdf.structure_tree
add_text_to_mcids(pdf, tree)
json.dump(tree, sys.stdout, indent=args.indent, ensure_ascii=False)
elif args.format == "csv":
pdf.to_csv(
sys.stdout,
Expand Down

0 comments on commit c1bacc5

Please sign in to comment.