Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #36

Merged
merged 4 commits into from
Feb 18, 2025
Merged

Dev #36

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Integration test
on: [push]

env:
TORCH_DEVICE: "cpu"
PYTHONIOENCODING: "utf-8"

jobs:
build:
Expand Down
79 changes: 72 additions & 7 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pypdfium2.raw as pdfium_c

from pdftext.pdf.utils import get_fontname
from pdftext.schema import Bbox, Chars
from pdftext.schema import Bbox, Char, Chars, Spans, Span


def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True) -> Chars:
Expand All @@ -15,11 +15,10 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
page_height = math.ceil(abs(y_end - y_start))

for i in range(textpage.count_chars()):
fontname, fontflag = get_fontname(textpage, i)
text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))

rotation = pdfium_c.FPDFText_GetCharAngle(textpage, i)
loosebox = rotation == 0 and (not text == "'" or quote_loosebox)
loosebox = (rotation == 0) and (text != "'" or quote_loosebox)

char_box = textpage.get_charbox(i, loose=loosebox)
cx_start, cy_start, cx_end, cy_end = char_box
Expand All @@ -35,16 +34,82 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
bbox_coords = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
bbox = Bbox(bbox_coords).rotate(page_width, page_height, page_rotation)

chars.append({
fontname, fontflag = get_fontname(textpage, i)
fontsize = pdfium_c.FPDFText_GetFontSize(textpage, i)
fontweight = pdfium_c.FPDFText_GetFontWeight(textpage, i)

char_dict: Char = {
"bbox": bbox,
"char": text,
"rotation": rotation,
"font": {
"name": fontname,
"flags": fontflag,
"size": pdfium_c.FPDFText_GetFontSize(textpage, i),
"weight": pdfium_c.FPDFText_GetFontWeight(textpage, i),
"size": fontsize,
"weight": fontweight,
},
"char_idx": i
})
}
chars.append(char_dict)

return chars


def deduplicate_chars(chars: Chars) -> Chars:
    """Remove characters that belong to words emitted more than once.

    The characters are first grouped, in input order, into word-like runs.
    A new run starts whenever the current run already ends with a newline,
    a space or ``'\\x02'`` (the original code labels this "hyphenation"),
    or when the incoming character differs from the run in any font field
    (``name``, ``flags``, ``size``, ``weight``) or in ``rotation``.

    Two runs are duplicates when their merged bounding boxes (coordinates
    rounded to whole units), text, rotation and font fields are all equal.
    Only the first occurrence of each run is kept.

    Args:
        chars: Character dicts carrying at least the keys ``bbox`` (an
            object exposing ``.bbox`` coordinates and ``.merge(other)``),
            ``char``, ``rotation``, ``font`` and ``char_idx``.

    Returns:
        The character dicts of the surviving runs, flattened back into a
        single list in their original order. The dicts themselves are not
        modified, so ``char_idx`` values may have gaps where duplicates
        were removed.
    """
    break_suffixes = ('\n', ' ', '\x02')
    font_keys = ('name', 'flags', 'size', 'weight')

    def start_word(first):
        # A fresh run seeded from a single character.
        return {
            "bbox": first["bbox"],
            "text": first["char"],
            "rotation": first["rotation"],
            "font": first["font"],
            "char_start_idx": first["char_idx"],
            "char_end_idx": first["char_idx"],
            "chars": [first],
            "url": '',
        }

    # Pass 1: group the characters into runs.
    words: Spans = []
    for char in chars:
        word = words[-1] if words else None
        if (
            word is None
            # The separator stays attached to the end of the previous run.
            or word['text'].endswith(break_suffixes)
            or any(char['font'][k] != word['font'][k] for k in font_keys)
            or char['rotation'] != word['rotation']
        ):
            words.append(start_word(char))
            continue

        word['text'] += char['char']
        word['char_end_idx'] = char['char_idx']
        word['bbox'] = word['bbox'].merge(char['bbox'])
        word['chars'].append(char)

    # Pass 2: keep the first occurrence of every distinct run.
    # A tuple key keeps each field separate; a delimiter-joined string can
    # collide when the text or the font name itself contains the delimiter.
    seen = set()
    deduped = []
    for word in words:
        font = word['font']
        key = (
            tuple(round(x, 0) for x in word['bbox'].bbox),
            word['text'],
            word['rotation'],
            *(font[k] for k in font_keys),
        )
        if key in seen:
            continue
        seen.add(key)
        deduped.append(word)

    return [char for word in deduped for char in word['chars']]
1 change: 1 addition & 0 deletions pdftext/pdf/links.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def _rect_to_scaled_bbox(rect, page_bbox, page_rotation) -> List[float]:
ty_end = page_height - cy_end

bbox = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
bbox = [round(x, 0) for x in bbox]
return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox


Expand Down
4 changes: 2 additions & 2 deletions pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pypdfium2 as pdfium

from pdftext.pdf.chars import get_chars
from pdftext.pdf.chars import get_chars, deduplicate_chars
from pdftext.pdf.utils import flatten
from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans

Expand Down Expand Up @@ -211,7 +211,7 @@ def get_pages(
except:
pass

chars = get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
spans = get_spans(chars)
lines = get_lines(spans)
blocks = get_blocks(lines)
Expand Down
2 changes: 1 addition & 1 deletion pdftext/scripts/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,5 @@ def extract_text_cli(
if out_path is None:
print(text)
else:
with open(out_path, "w+") as f:
with open(out_path, "w+", encoding="utf-8") as f:
f.write(text)
Loading