Skip to content

Commit

Permalink
Merge pull request #36 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
VikParuchuri authored Feb 18, 2025
2 parents 92fd696 + 27971aa commit c10283f
Show file tree
Hide file tree
Showing 7 changed files with 359 additions and 285 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Integration test
on: [push]

env:
TORCH_DEVICE: "cpu"
PYTHONIOENCODING: "utf-8"

jobs:
build:
Expand Down
79 changes: 72 additions & 7 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pypdfium2.raw as pdfium_c

from pdftext.pdf.utils import get_fontname
from pdftext.schema import Bbox, Chars
from pdftext.schema import Bbox, Char, Chars, Spans, Span


def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotation: int, quote_loosebox=True) -> Chars:
Expand All @@ -15,11 +15,10 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
page_height = math.ceil(abs(y_end - y_start))

for i in range(textpage.count_chars()):
fontname, fontflag = get_fontname(textpage, i)
text = chr(pdfium_c.FPDFText_GetUnicode(textpage, i))

rotation = pdfium_c.FPDFText_GetCharAngle(textpage, i)
loosebox = rotation == 0 and (not text == "'" or quote_loosebox)
loosebox = (rotation == 0) and (text != "'" or quote_loosebox)

char_box = textpage.get_charbox(i, loose=loosebox)
cx_start, cy_start, cx_end, cy_end = char_box
Expand All @@ -35,16 +34,82 @@ def get_chars(textpage: pdfium.PdfTextPage, page_bbox: list[float], page_rotatio
bbox_coords = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
bbox = Bbox(bbox_coords).rotate(page_width, page_height, page_rotation)

chars.append({
fontname, fontflag = get_fontname(textpage, i)
fontsize = pdfium_c.FPDFText_GetFontSize(textpage, i)
fontweight = pdfium_c.FPDFText_GetFontWeight(textpage, i)

char_dict: Char = {
"bbox": bbox,
"char": text,
"rotation": rotation,
"font": {
"name": fontname,
"flags": fontflag,
"size": pdfium_c.FPDFText_GetFontSize(textpage, i),
"weight": pdfium_c.FPDFText_GetFontWeight(textpage, i),
"size": fontsize,
"weight": fontweight,
},
"char_idx": i
})
}
chars.append(char_dict)

return chars


def deduplicate_chars(chars: Chars) -> Chars:
    """Drop the characters of words that occur more than once on the page.

    The characters are first grouped, in input order, into word-like runs.
    A new run is started when the previous run ends with a newline, a space
    or the control character 0x02, or when the character's font info
    (name, flags, size, weight) or rotation differs from the run's. Runs are
    then compared on their rounded bounding box, text, rotation and font
    info, and only the first occurrence of each distinct run is kept.

    Args:
        chars: Character dicts carrying the ``bbox``, ``char``, ``rotation``,
            ``font`` and ``char_idx`` entries read below.

    Returns:
        A new list holding the characters of the surviving runs, in their
        original relative order. The character dicts are reused, not copied.
    """
    font_keys = ("name", "flags", "size", "weight")
    # A run ending in one of these is considered finished.
    # NOTE(review): 0x02 is presumably the hyphenation marker emitted by the
    # PDF backend (the original comment called this "hyphenation") - confirm.
    run_terminators = ("\n", " ", "\x02")

    def new_word(char):
        # Build a single-character run seeded from `char`.
        return {
            "bbox": char["bbox"],
            "text": char["char"],
            "rotation": char["rotation"],
            "font": char["font"],
            "char_start_idx": char["char_idx"],
            "char_end_idx": char["char_idx"],
            "chars": [char],
            "url": "",
        }

    def starts_new_word(word, char):
        # True when `char` must not be appended to the current run `word`.
        if word["text"].endswith(run_terminators):
            return True
        if any(char["font"][k] != word["font"][k] for k in font_keys):
            return True
        return char["rotation"] != word["rotation"]

    words: Spans = []
    for char in chars:
        if not words or starts_new_word(words[-1], char):
            words.append(new_word(char))
            continue

        word = words[-1]
        word["text"] += char["char"]
        word["char_end_idx"] = char["char_idx"]
        word["bbox"] = word["bbox"].merge(char["bbox"])
        word["chars"].append(char)

    # Keep only the first occurrence of each run. A tuple key is used rather
    # than a delimiter-joined string because text and font names may contain
    # the delimiter themselves, which would let distinct runs collide.
    seen = set()
    kept_chars = []
    for word in words:
        font = word["font"]
        key = (
            # Round so that near-identical overlapping draws compare equal.
            tuple(round(coord, 0) for coord in word["bbox"].bbox),
            word["text"],
            word["rotation"],
            font["name"],
            font["flags"],
            font["size"],
            font["weight"],
        )
        if key in seen:
            continue
        seen.add(key)
        kept_chars.extend(word["chars"])

    return kept_chars
1 change: 1 addition & 0 deletions pdftext/pdf/links.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def _rect_to_scaled_bbox(rect, page_bbox, page_rotation) -> List[float]:
ty_end = page_height - cy_end

bbox = [min(cx_start, cx_end), min(ty_start, ty_end), max(cx_start, cx_end), max(ty_start, ty_end)]
bbox = [round(x, 0) for x in bbox]
return Bbox(bbox).rotate(page_width, page_height, page_rotation).bbox


Expand Down
4 changes: 2 additions & 2 deletions pdftext/pdf/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pypdfium2 as pdfium

from pdftext.pdf.chars import get_chars
from pdftext.pdf.chars import get_chars, deduplicate_chars
from pdftext.pdf.utils import flatten
from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans

Expand Down Expand Up @@ -211,7 +211,7 @@ def get_pages(
except:
pass

chars = get_chars(textpage, page_bbox, page_rotation, quote_loosebox)
chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
spans = get_spans(chars)
lines = get_lines(spans)
blocks = get_blocks(lines)
Expand Down
2 changes: 1 addition & 1 deletion pdftext/scripts/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,5 @@ def extract_text_cli(
if out_path is None:
print(text)
else:
with open(out_path, "w+") as f:
with open(out_path, "w+", encoding="utf-8") as f:
f.write(text)
Loading

0 comments on commit c10283f

Please sign in to comment.