Skip to content

Commit

Permalink
Superscripts
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Feb 25, 2025
1 parent 27971aa commit 58790e2
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 4 deletions.
32 changes: 31 additions & 1 deletion pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,35 @@
from pdftext.pdf.utils import flatten
from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans

def assign_superscripts(lines: "Lines", height_threshold: float = 0.8) -> None:
    """Mark superscript spans in place via ``span["superscript"]``.

    Each span is compared with its immediate neighbours on the same line.
    A neighbour whose text is empty or whitespace-only is treated as absent.
    A span is flagged when its own text is non-blank, it has at least one
    non-blank neighbour, and for every such neighbour:

    * ``span height / max(1, neighbour height) <= height_threshold``, and
    * the span's ``bbox.y_start`` is strictly smaller than the neighbour's
      (treated as "sits higher"; assumes y grows downward -- TODO confirm).

    Args:
        lines: Iterable of line dicts, each holding a ``"spans"`` list whose
            items expose ``"text"`` and a ``"bbox"`` with ``height`` and
            ``y_start`` attributes.
        height_threshold: Maximum height ratio (span / neighbour) for a span
            to still count as smaller than its neighbour.

    Returns:
        None. Spans are mutated in place; every span ends up with a boolean
        ``"superscript"`` key (existing values are never reset to ``False``).
    """
    for line in lines:
        spans = line["spans"]

        # Keep the key present on every span, including those on lines that
        # are skipped below, so consumers never hit a missing key.
        for span in spans:
            span.setdefault("superscript", False)

        if len(spans) < 2:
            continue

        last_idx = len(spans) - 1
        prev_span = None
        for i, span in enumerate(spans):
            next_span = spans[i + 1] if i < last_idx else None

            # Blank neighbours are ignored for the comparison.
            neighbours = [
                other
                for other in (prev_span, next_span)
                if other is not None and other["text"].strip()
            ]

            # Without any real neighbour there is nothing to compare against;
            # previously such a span passed every check vacuously and plain
            # text next to whitespace was flagged as superscript.
            if neighbours and span["text"].strip():
                height = span["bbox"].height
                top = span["bbox"].y_start
                if all(
                    # max(1, ...) keeps the denominator away from zero.
                    height / max(1, other["bbox"].height) <= height_threshold
                    and top < other["bbox"].y_start
                    for other in neighbours
                ):
                    span["superscript"] = True

            prev_span = span


def get_spans(chars: Chars) -> Spans:
spans: Spans = []
Expand Down Expand Up @@ -45,7 +74,7 @@ def span_break():
continue

# we also break on hyphenation ("\x02") and when the span ends with a newline
if span['text'].endswith("\x02"):
if span['text'].endswith("\x02") or span["text"].endswith("\n"):
span_break()
continue

Expand Down Expand Up @@ -214,6 +243,7 @@ def get_pages(
chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
spans = get_spans(chars)
lines = get_lines(spans)
assign_superscripts(lines)
blocks = get_blocks(lines)

pages.append({
Expand Down
4 changes: 4 additions & 0 deletions pdftext/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ def __init__(self, bbox: List[float], ensure_nonzero_area=False):
def __getitem__(self, item):
    """Return ``self.bbox[item]`` so the instance can be indexed like its coordinate list."""
    return self.bbox[item]

def __repr__(self) -> str:
    """Return a debug string of the form ``Bbox(<self.bbox>)``."""
    return f"Bbox({self.bbox})"

@property
def height(self):
    """Return ``self.bbox[3] - self.bbox[1]``.

    NOTE(review): presumably ``y_end - y_start`` for an ``[x0, y0, x1, y1]``
    layout -- confirm against the constructor.
    """
    return self.bbox[3] - self.bbox[1]
Expand Down Expand Up @@ -140,6 +143,7 @@ class Span(TypedDict):
char_end_idx: int
rotation: int
url: str
superscript: bool


class Line(TypedDict):
Expand Down
19 changes: 17 additions & 2 deletions pdftext/scripts/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,25 @@ def extract_text_cli(
assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided"

if kwargs["json"]:
text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"])
text = dictionary_output(
pdf_path,
sort=kwargs["sort"],
page_range=pages,
flatten_pdf=kwargs["flatten_pdf"],
keep_chars=kwargs["keep_chars"],
workers=kwargs["workers"],
disable_links=True
)
text = json.dumps(text)
else:
text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"])
text = plain_text_output(
pdf_path,
sort=kwargs["sort"],
hyphens=kwargs["keep_hyphens"],
page_range=pages,
flatten_pdf=kwargs["flatten_pdf"],
workers=kwargs["workers"]
)

if out_path is None:
print(text)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.6.0"
version = "0.6.1"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <[email protected]>"]
license = "Apache-2.0"
Expand Down

0 comments on commit 58790e2

Please sign in to comment.