Skip to content

Commit

Permalink
add word level deduplication
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Feb 10, 2025
1 parent fd881b7 commit 5386da0
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 9 deletions.
76 changes: 69 additions & 7 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pypdfium2.raw as pdfium_c

from pdftext.pdf.utils import get_fontname
from pdftext.schema import Bbox, Chars
from pdftext.schema import Bbox, Char, Chars, Spans, Span


def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True) -> Chars:
Expand All @@ -15,11 +15,10 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
page_height = math.ceil(abs(y_end - y_start))

for i in range(textpage.count_chars()):
fontname, fontflag = get_fontname(textpage, i)
text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))

rotation = pdfium_c.FPDFText_GetCharAngle(textpage, i)
loosebox = rotation == 0 and (not text == "'" or quote_loosebox)
loosebox = (rotation == 0) and (text != "'" or quote_loosebox)

char_box = textpage.get_charbox(i, loose=loosebox)
cx_start, cy_start, cx_end, cy_end = char_box
Expand All @@ -33,18 +32,81 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
ty_end = page_height - cy_end

bbox_coords = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
bbox_coords = [round(x, 0) for x in bbox_coords]
bbox = Bbox(bbox_coords).rotate(page_width, page_height, page_rotation)

chars.append({
fontname, fontflag = get_fontname(textpage, i)
fontsize = pdfium_c.FPDFText_GetFontSize(textpage, i)
fontweight = pdfium_c.FPDFText_GetFontWeight(textpage, i)

char_dict: Char = {
"bbox": bbox,
"char": text,
"rotation": rotation,
"font": {
"name": fontname,
"flags": fontflag,
"size": pdfium_c.FPDFText_GetFontSize(textpage, i),
"weight": pdfium_c.FPDFText_GetFontWeight(textpage, i),
"size": fontsize,
"weight": fontweight,
},
"char_idx": i
})
}
chars.append(char_dict)

return chars


def deduplicate_chars(chars: Chars) -> Chars:
    """Remove characters that belong to words drawn more than once.

    The characters are first grouped, in input order, into "words". A new
    word is started when any of the following holds:

    * there is no current word yet,
    * the current word's text already ends with a newline, a space or
      ``"\\x02"`` (presumably pdfium's hyphenation marker -- TODO confirm),
    * the character's font ``name``, ``flags``, ``size`` or ``weight``
      differs from the current word's font,
    * the character's rotation differs from the current word's rotation.

    Two words are duplicates when their merged bounding box, their
    whitespace-stripped text, their rotation and their font info are all
    equal. Only the first occurrence of each word is kept.

    Args:
        chars: Character dicts providing the keys ``bbox``, ``char``,
            ``rotation``, ``font`` and ``char_idx``. Each ``bbox`` must
            expose ``merge(other)`` and a ``bbox`` attribute.

    Returns:
        A flat list with the characters of the surviving words, in their
        original order. The input list itself is not modified.
    """
    font_keys = ("name", "flags", "size", "weight")
    # A word is considered finished once its text ends with one of these.
    word_terminators = ("\n", " ", "\x02")

    words: Spans = []

    def start_word(first_char):
        # Open a new word seeded from a single character. The character is
        # passed explicitly rather than read from the enclosing loop.
        words.append({
            "bbox": first_char["bbox"],
            "text": first_char["char"],
            "rotation": first_char["rotation"],
            "font": first_char["font"],
            "char_start_idx": first_char["char_idx"],
            "char_end_idx": first_char["char_idx"],
            "chars": [first_char],
            "url": "",
        })

    for char in chars:
        if not words:
            start_word(char)
            continue

        word = words[-1]
        starts_new_word = (
            word["text"].endswith(word_terminators)
            or any(char["font"][k] != word["font"][k] for k in font_keys)
            or char["rotation"] != word["rotation"]
        )
        if starts_new_word:
            start_word(char)
            continue

        # Same word: extend its text, index range, bounding box and chars.
        word["text"] += char["char"]
        word["char_end_idx"] = char["char_idx"]
        word["bbox"] = word["bbox"].merge(char["bbox"])
        word["chars"].append(char)

    # Keep only the first occurrence of every word. The key is a tuple, not
    # a "-"-joined string, so a "-" inside the text or the font name cannot
    # make two different words look identical.
    seen = set()
    deduped = []
    for word in words:
        font = word["font"]
        key = (
            str(word["bbox"].bbox),
            word["text"].strip(),
            word["rotation"],
            font["name"],
            font["flags"],
            font["size"],
            font["weight"],
        )
        if key in seen:
            continue
        seen.add(key)
        deduped.append(word)

    return [char for word in deduped for char in word["chars"]]
4 changes: 2 additions & 2 deletions pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pypdfium2 as pdfium

from pdftext.pdf.chars import get_chars
from pdftext.pdf.chars import get_chars, deduplicate_chars
from pdftext.pdf.utils import flatten
from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans

Expand Down Expand Up @@ -211,7 +211,7 @@ def get_pages(
except:
pass

chars = get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
spans = get_spans(chars)
lines = get_lines(spans)
blocks = get_blocks(lines)
Expand Down

0 comments on commit 5386da0

Please sign in to comment.