From 58790e29b101a129186982824910433f303b410a Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Tue, 25 Feb 2025 18:59:46 -0500
Subject: [PATCH] Superscripts

---
 pdftext/pdf/pages.py            | 32 +++++++++++++++++++++++++++++++-
 pdftext/schema.py               |  4 ++++
 pdftext/scripts/extract_text.py | 19 +++++++++++++++++--
 pyproject.toml                  |  2 +-
 4 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py
index 14fc04d..f30dc65 100644
--- a/pdftext/pdf/pages.py
+++ b/pdftext/pdf/pages.py
@@ -10,6 +10,35 @@
 from pdftext.pdf.utils import flatten
 from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans
 
+def assign_superscripts(lines: Lines, height_threshold: float = 0.8):
+    """Flag superscript spans by writing span["superscript"] in place.
+
+    A non-blank span is a superscript when it has at least one non-blank
+    adjacent span and, against each one, its height ratio is at most
+    `height_threshold` and its top (`bbox.y_start`) is strictly smaller.
+    Every span receives an explicit bool; nothing is returned.
+    """
+    for line in lines:
+        spans = line["spans"]
+        for i, span in enumerate(spans):
+            # Blank or out-of-range neighbours are treated as absent.
+            neighbours = [
+                spans[j] for j in (i - 1, i + 1)
+                if 0 <= j < len(spans) and spans[j]["text"].strip()
+            ]
+            height, top = span["bbox"].height, span["bbox"].y_start
+            # Requiring a neighbour stops a span isolated by whitespace from
+            # passing vacuously; the flag is always written so the key exists.
+            span["superscript"] = bool(
+                span["text"].strip()
+                and neighbours
+                and all(
+                    height / max(1, n["bbox"].height) <= height_threshold
+                    and top < n["bbox"].y_start
+                    for n in neighbours
+                )
+            )
+
 
 def get_spans(chars: Chars) -> Spans:
     spans: Spans = []
@@ -45,7 +74,7 @@ def span_break():
             continue
 
         # we also break on hyphenation
-        if span['text'].endswith("\x02"):
+        if span['text'].endswith("\x02") or span["text"].endswith("\n"):
             span_break()
             continue
 
@@ -214,6 +243,7 @@ def get_pages(
         chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
         spans = get_spans(chars)
         lines = get_lines(spans)
+        assign_superscripts(lines)
         blocks = get_blocks(lines)
 
         pages.append({
diff --git a/pdftext/schema.py b/pdftext/schema.py
index c6e8e95..eecd73d 100644
--- a/pdftext/schema.py
+++ b/pdftext/schema.py
@@ -16,6 +16,9 @@ def __init__(self, bbox: List[float], ensure_nonzero_area=False):
     def __getitem__(self, item):
         return self.bbox[item]
 
+    def __repr__(self):  # debugging aid: wraps the stored bbox value in "Bbox(...)"
+        return "Bbox({})".format(self.bbox)
+
     @property
     def height(self):
         return self.bbox[3] - self.bbox[1]
@@ -140,6 +143,7 @@ class Span(TypedDict):
     char_end_idx: int
     rotation: int
     url: str
+    superscript: bool
 
 
 class Line(TypedDict):
diff --git a/pdftext/scripts/extract_text.py b/pdftext/scripts/extract_text.py
index ced6f36..af6db1a 100644
--- a/pdftext/scripts/extract_text.py
+++ b/pdftext/scripts/extract_text.py
@@ -43,10 +43,25 @@ def extract_text_cli(
         assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided"
 
     if kwargs["json"]:
-        text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"])
+        text = dictionary_output(
+            pdf_path,
+            sort=kwargs["sort"],
+            page_range=pages,
+            flatten_pdf=kwargs["flatten_pdf"],
+            keep_chars=kwargs["keep_chars"],
+            workers=kwargs["workers"],
+            disable_links=True
+        )
         text = json.dumps(text)
     else:
-        text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"])
+        text = plain_text_output(
+            pdf_path,
+            sort=kwargs["sort"],
+            hyphens=kwargs["keep_hyphens"],
+            page_range=pages,
+            flatten_pdf=kwargs["flatten_pdf"],
+            workers=kwargs["workers"]
+        )
 
     if out_path is None:
         print(text)
diff --git a/pyproject.toml b/pyproject.toml
index 24c7e25..54821a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.6.0"
+version = "0.6.1"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 license = "Apache-2.0"