Skip to content

Commit

Permalink
Fix font issue with spans
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 7, 2024
1 parent ebace90 commit 50398d5
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 3 deletions.
4 changes: 3 additions & 1 deletion pdftext/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,9 @@ def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
prediction_probs = yield training_row
# First item is probability of same line/block, second is probability of new line, third is probability of new block
if prediction_probs[0] >= .5:
- pass
+ # Ensure we update spans properly for font info when predicting no new line
+ if prev_font_info != font_info:
+ span = update_span(line, span)
elif prediction_probs[2] > block_threshold:
span = update_span(line, span)
line = update_line(block, line)
Expand Down
2 changes: 1 addition & 1 deletion pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def get_pdfium_chars(pdf, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ, pa

rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * 180 / math.pi # convert from radians to degrees
- coords = text_page.get_charbox(i, loose=rotation == 0) # Loose doesn't work properly when page is rotated
+ coords = text_page.get_charbox(i, loose=rotation == 0) # Loose doesn't work properly when charbox is rotated
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, bl_origin, page_rotation, normalize=True)

char_info = {
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
- version = "0.3.6"
+ version = "0.3.7"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <[email protected]>"]
license = "Apache-2.0"
Expand Down

0 comments on commit 50398d5

Please sign in to comment.