From 58790e29b101a129186982824910433f303b410a Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 25 Feb 2025 18:59:46 -0500 Subject: [PATCH] Superscripts --- pdftext/pdf/pages.py | 32 +++++++++++++++++++++++++++++++- pdftext/schema.py | 4 ++++ pdftext/scripts/extract_text.py | 19 +++++++++++++++++-- pyproject.toml | 2 +- 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py index 14fc04d..f30dc65 100644 --- a/pdftext/pdf/pages.py +++ b/pdftext/pdf/pages.py @@ -10,6 +10,35 @@ from pdftext.pdf.utils import flatten from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans +def assign_superscripts(lines: Lines, height_threshold: float = 0.8): + for line in lines: + prev_span = None + if len(line["spans"]) < 2: + continue + + for i, span in enumerate(line["spans"]): + is_first = i == 0 or not prev_span["text"].strip() + is_last = i == len(line["spans"]) - 1 or not line["spans"][i + 1]["text"].strip() + span_height = span["bbox"].height + span_top = span["bbox"].y_start + + prev_fullheight = is_first or span_height / max(1, prev_span["bbox"].height) <= height_threshold + next_fullheight = is_last or span_height / max(1, line["spans"][i + 1]["bbox"].height) <= height_threshold + + prev_above = is_first or span_top < prev_span["bbox"].y_start + next_above = is_last or span_top < line["spans"][i + 1]["bbox"].y_start + + if all([ + prev_fullheight, + next_fullheight, + prev_above, + next_above, + span["text"].strip() + ]): + span["superscript"] = True + + prev_span = span + def get_spans(chars: Chars) -> Spans: spans: Spans = [] @@ -45,7 +74,7 @@ def span_break(): continue # we also break on hyphenation - if span['text'].endswith("\x02"): + if span['text'].endswith("\x02") or span["text"].endswith("\n"): span_break() continue @@ -214,6 +243,7 @@ def get_pages( chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox)) spans = get_spans(chars) lines = get_lines(spans) + assign_superscripts(lines) blocks = get_blocks(lines) pages.append({ diff --git a/pdftext/schema.py b/pdftext/schema.py index c6e8e95..eecd73d 100644 --- a/pdftext/schema.py +++ b/pdftext/schema.py @@ -16,6 +16,9 @@ def __init__(self, bbox: List[float], ensure_nonzero_area=False): def __getitem__(self, item): return self.bbox[item] + def __repr__(self): + return f"Bbox({self.bbox})" + @property def height(self): return self.bbox[3] - self.bbox[1] @@ -140,6 +143,7 @@ class Span(TypedDict): char_end_idx: int rotation: int url: str + superscript: bool class Line(TypedDict): diff --git a/pdftext/scripts/extract_text.py b/pdftext/scripts/extract_text.py index ced6f36..af6db1a 100644 --- a/pdftext/scripts/extract_text.py +++ b/pdftext/scripts/extract_text.py @@ -43,10 +43,25 @@ def extract_text_cli( assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided" if kwargs["json"]: - text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"]) + text = dictionary_output( + pdf_path, + sort=kwargs["sort"], + page_range=pages, + flatten_pdf=kwargs["flatten_pdf"], + keep_chars=kwargs["keep_chars"], + workers=kwargs["workers"], + disable_links=True + ) text = json.dumps(text) else: - text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"]) + text = plain_text_output( + pdf_path, + sort=kwargs["sort"], + hyphens=kwargs["keep_hyphens"], + page_range=pages, + flatten_pdf=kwargs["flatten_pdf"], + workers=kwargs["workers"] + ) if out_path is None: print(text) diff --git a/pyproject.toml b/pyproject.toml index 24c7e25..54821a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdftext" -version = "0.6.0" +version = "0.6.1" description = "Extract structured text from pdfs quickly" authors = ["Vik Paruchuri "] license = "Apache-2.0"