From 5386da0b1b74cad3b3b6686166a979fe48ae5f6b Mon Sep 17 00:00:00 2001
From: Moses Paul R
Date: Mon, 10 Feb 2025 10:54:08 +0000
Subject: [PATCH] add word level deduplication

---
Review notes (this section is ignored when the patch is applied):
- bbox rounding now happens only when building the dedup key, so get_chars
  keeps emitting full-precision bboxes.
- deduplicate_chars no longer relies on a closure over the loop variable and
  uses a tuple key in a set instead of a '-'-joined string key.

 pdftext/pdf/chars.py | 79 ++++++++++++++++++++++++++++++++++++++++----
 pdftext/pdf/pages.py |  4 +--
 2 files changed, 74 insertions(+), 9 deletions(-)

diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
index 534d303..4dc744b 100644
--- a/pdftext/pdf/chars.py
+++ b/pdftext/pdf/chars.py
@@ -4,7 +4,7 @@
 import pypdfium2.raw as pdfium_c
 
 from pdftext.pdf.utils import get_fontname
-from pdftext.schema import Bbox, Chars
+from pdftext.schema import Bbox, Char, Chars, Spans, Span
 
 
 def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True) -> Chars:
@@ -15,11 +15,10 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
     page_height = math.ceil(abs(y_end - y_start))
 
     for i in range(textpage.count_chars()):
-        fontname, fontflag = get_fontname(textpage, i)
         text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))
 
         rotation = pdfium_c.FPDFText_GetCharAngle(textpage, i)
-        loosebox = rotation == 0 and (not text == "'" or quote_loosebox)
+        loosebox = (rotation == 0) and (text != "'" or quote_loosebox)
 
         char_box = textpage.get_charbox(i, loose=loosebox)
         cx_start, cy_start, cx_end, cy_end = char_box
@@ -33,18 +32,84 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
         ty_end = page_height - cy_end
 
         bbox_coords = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
         bbox = Bbox(bbox_coords).rotate(page_width, page_height, page_rotation)
 
-        chars.append({
+        fontname, fontflag = get_fontname(textpage, i)
+        fontsize = pdfium_c.FPDFText_GetFontSize(textpage, i)
+        fontweight = pdfium_c.FPDFText_GetFontWeight(textpage, i)
+
+        char_dict: Char = {
             "bbox": bbox,
             "char": text,
             "rotation": rotation,
             "font": {
                 "name": fontname,
                 "flags": fontflag,
-                "size": pdfium_c.FPDFText_GetFontSize(textpage, i),
-                "weight": pdfium_c.FPDFText_GetFontWeight(textpage, i),
+                "size": fontsize,
+                "weight": fontweight,
             },
             "char_idx": i
-        })
+        }
+        chars.append(char_dict)
+
     return chars
+
+
+def deduplicate_chars(chars: Chars) -> Chars:
+    """Drop chars that belong to words drawn more than once.
+
+    Chars are first grouped into words: a new word starts after a space, a
+    newline or a hyphenation marker, or when the font or rotation changes.
+    Two words are duplicates when their bbox (rounded to whole units), stripped
+    text, rotation and font all match; only the first occurrence is kept.
+
+    Returns the chars of the surviving words, in their original order.
+    """
+    font_keys = ("name", "flags", "size", "weight")
+    # '\x02' is the hyphenation marker, so a word also ends on hyphenation
+    word_terminators = ("\n", " ", "\x02")
+
+    words: Spans = []
+    for char in chars:
+        word = words[-1] if words else None
+        starts_new_word = (
+            word is None
+            or word["text"].endswith(word_terminators)
+            or any(char["font"][k] != word["font"][k] for k in font_keys)
+            or char["rotation"] != word["rotation"]
+        )
+        if starts_new_word:
+            new_word: Span = {
+                "bbox": char["bbox"],
+                "text": char["char"],
+                "rotation": char["rotation"],
+                "font": char["font"],
+                "char_start_idx": char["char_idx"],
+                "char_end_idx": char["char_idx"],
+                "chars": [char],
+                "url": '',
+            }
+            words.append(new_word)
+            continue
+
+        word["text"] += char["char"]
+        word["char_end_idx"] = char["char_idx"]
+        word["bbox"] = word["bbox"].merge(char["bbox"])
+        word["chars"].append(char)
+
+    seen = set()
+    deduped: Chars = []
+    for word in words:
+        font = word["font"]
+        key = (
+            # round only for comparison so the returned chars keep precise bboxes
+            tuple(round(coord, 0) for coord in word["bbox"].bbox),
+            word["text"].strip(),
+            word["rotation"],
+            *(font[k] for k in font_keys),
+        )
+        if key not in seen:
+            seen.add(key)
+            deduped.extend(word["chars"])
+
+    return deduped
diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py
index bc1836f..14fc04d 100644
--- a/pdftext/pdf/pages.py
+++ b/pdftext/pdf/pages.py
@@ -6,7 +6,7 @@
 
 import pypdfium2 as pdfium
 
-from pdftext.pdf.chars import get_chars
+from pdftext.pdf.chars import get_chars, deduplicate_chars
 from pdftext.pdf.utils import flatten
 from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans
 
@@ -211,7 +211,7 @@ def get_pages(
         except:
             pass
 
-        chars = get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
+        chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
         spans = get_spans(chars)
         lines = get_lines(spans)
         blocks = get_blocks(lines)