VikParuchuri · VikParuchuri · Feb 18, 2025 · Feb 10, 2025 · Feb 10, 2025 · Feb 12, 2025
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -3,7 +3,7 @@ name: Integration test
 on: [push]
 
 env:
-  TORCH_DEVICE: "cpu"
+  PYTHONIOENCODING: "utf-8"
 
 jobs:
   build:

diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
@@ -4,7 +4,7 @@
 import pypdfium2.raw as pdfium_c
 
 from pdftext.pdf.utils import get_fontname
-from pdftext.schema import Bbox, Chars
+from pdftext.schema import Bbox, Char, Chars, Spans, Span
 
 
 def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True) -> Chars:
@@ -15,11 +15,10 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
     page_height = math.ceil(abs(y_end - y_start))
 
     for i in range(textpage.count_chars()):
-        fontname, fontflag = get_fontname(textpage, i)
         text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))
 
         rotation = pdfium_c.FPDFText_GetCharAngle(textpage, i)
-        loosebox = rotation == 0 and (not text == "'" or quote_loosebox)
+        loosebox = (rotation == 0) and (text != "'" or quote_loosebox)
 
         char_box = textpage.get_charbox(i, loose=loosebox)
         cx_start, cy_start, cx_end, cy_end = char_box
@@ -35,16 +34,82 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
         bbox_coords = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
         bbox = Bbox(bbox_coords).rotate(page_width, page_height, page_rotation)
 
-        chars.append({
+        fontname, fontflag = get_fontname(textpage, i)
+        fontsize = pdfium_c.FPDFText_GetFontSize(textpage, i)
+        fontweight = pdfium_c.FPDFText_GetFontWeight(textpage, i)
+
+        char_dict: Char = {
             "bbox": bbox,
             "char": text,
             "rotation": rotation,
             "font": {
                 "name": fontname,
                 "flags": fontflag,
-                "size": pdfium_c.FPDFText_GetFontSize(textpage, i),
-                "weight": pdfium_c.FPDFText_GetFontWeight(textpage, i),
+                "size": fontsize,
+                "weight": fontweight,
             },
             "char_idx": i
-        })
+        }
+        chars.append(char_dict)
+
     return chars
+
+
+def deduplicate_chars(chars: Chars) -> Chars:
+    """Remove the chars that belong to duplicated words.
+
+    Chars are first grouped into words: a new word starts after a newline,
+    a space or a hyphenation marker, or when the font info or the rotation
+    changes. Words that share rounded bbox, text, rotation and font are
+    duplicates; only the first one is kept.
+
+    Returns the chars of the kept words, flattened in their original order.
+    """
+    words: Spans = []
+
+    def start_word(first: Char) -> None:
+        words.append({
+            "bbox": first["bbox"],
+            "text": first["char"],
+            "rotation": first["rotation"],
+            "font": first["font"],
+            "char_start_idx": first["char_idx"],
+            "char_end_idx": first["char_idx"],
+            "chars": [first],
+            "url": '',
+        })
+
+    for char in chars:
+        word = words[-1] if words else None
+        if (
+            word is None
+            # we also break on hyphenation
+            or word['text'].endswith(('\n', ' ', '\x02'))
+            # we break on any change in font info
+            or any(char['font'][k] != word['font'][k] for k in ['name', 'flags', 'size', 'weight'])
+            or char['rotation'] != word['rotation']
+        ):
+            start_word(char)
+            continue
+
+        word['text'] += char['char']
+        word['char_end_idx'] = char['char_idx']
+        word['bbox'] = word['bbox'].merge(char['bbox'])
+        word['chars'].append(char)
+
+    seen = set()
+    deduped = []
+    for word in words:
+        font = word['font']
+        # Deduplicate on a tuple key: unlike a "-"-joined string it stays
+        # unambiguous when the text or the font name itself contains "-".
+        key = (
+            tuple(round(x, 0) for x in word['bbox'].bbox),  # rounded so near-identical boxes match
+            word['text'], word['rotation'],
+            font['name'], font['flags'], font['size'], font['weight'],
+        )
+        if key not in seen:
+            seen.add(key)
+            deduped.append(word)
+
+    return [char for word in deduped for char in word['chars']]
diff --git a/pdftext/pdf/links.py b/pdftext/pdf/links.py
@@ -40,6 +40,7 @@ def _rect_to_scaled_bbox(rect, page_bbox, page_rotation) -> List[float]:
     ty_end = page_height - cy_end
 
     bbox = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
+    bbox = [round(x, 0) for x in bbox]
     return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox
 
 

diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py
@@ -6,7 +6,7 @@
 
 import pypdfium2 as pdfium
 
-from pdftext.pdf.chars import get_chars
+from pdftext.pdf.chars import get_chars, deduplicate_chars
 from pdftext.pdf.utils import flatten
 from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans
 
@@ -211,7 +211,7 @@ def get_pages(
         except:
             pass
 
-        chars = get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
+        chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
         spans = get_spans(chars)
         lines = get_lines(spans)
         blocks = get_blocks(lines)

diff --git a/pdftext/scripts/extract_text.py b/pdftext/scripts/extract_text.py
@@ -51,5 +51,5 @@ def extract_text_cli(
     if out_path is None:
         print(text)
     else:
-        with open(out_path, "w+") as f:
+        with open(out_path, "w+", encoding="utf-8") as f:
             f.write(text)
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,7 +3,7 @@ name: Integration test @@
     on: [push]
     env:
-      TORCH_DEVICE: "cpu"
+      PYTHONIOENCODING: "utf-8"
     jobs:
       build:
@@ Expand Down @@