Skip to content

Commit

Permalink
Enable flattening pdfs
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Oct 7, 2024
1 parent ce2a62c commit ea9822b
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 117 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ pdftext PDF_PATH --out_path output.txt
- `--keep_hyphens` will keep hyphens in the output (they will be stripped and words joined otherwise)
- `--pages` will specify pages (comma separated) to extract
- `--workers` specifies the number of parallel workers to use
- `--flatten_pdf` merges form fields into the PDF

## JSON

Expand All @@ -44,6 +45,7 @@ pdftext PDF_PATH --out_path output.txt --json
- `--pages` will specify pages (comma separated) to extract
- `--keep_chars` will keep individual characters in the json output
- `--workers` specifies the number of parallel workers to use
- `--flatten_pdf` merges form fields into the PDF

The output will be a json list, with each item in the list corresponding to a single page in the input pdf (in order). Each page will include the following keys:

Expand Down
9 changes: 4 additions & 5 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from functools import partial
from typing import List
from concurrent.futures import ProcessPoolExecutor
import math
Expand All @@ -12,13 +11,12 @@
from pdftext.settings import settings



def _load_pdf(pdf, flatten_pdf):
if isinstance(pdf, str):
pdf = pdfium.PdfDocument(pdf)
else:
if not isinstance(pdf, pdfium.PdfDocument):
raise TypeError("pdf must be a file path string or a PdfDocument object")

if not isinstance(pdf, pdfium.PdfDocument):
raise TypeError("pdf must be a file path string or a PdfDocument object")

# Must be called on the parent pdf, before the page was retrieved
if flatten_pdf:
Expand Down Expand Up @@ -66,6 +64,7 @@ def _get_pages(pdf_path, page_range=None, flatten_pdf=False, workers=None):

return ordered_pages


def plain_text_output(pdf_path, sort=False, hyphens=False, page_range=None, flatten_pdf=False, workers=None) -> str:
    """Return the text of a PDF as one newline-joined string.

    Every argument is forwarded unchanged to ``paginated_plain_text_output``;
    the items it returns are then joined with ``"\\n"``.
    """
    page_texts = paginated_plain_text_output(
        pdf_path,
        sort=sort,
        hyphens=hyphens,
        page_range=page_range,
        workers=workers,
        flatten_pdf=flatten_pdf,
    )
    newline = "\n"
    return newline.join(page_texts)
Expand Down
3 changes: 0 additions & 3 deletions pdftext/inference.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
from itertools import chain

import numpy as np
import sklearn

from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
from pdftext.settings import settings
Expand Down
9 changes: 8 additions & 1 deletion pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Dict, List

import pypdfium2.raw as pdfium_c
from pypdfium2 import PdfiumError

from pdftext.pdf.utils import get_fontname, pdfium_page_bbox_to_device_bbox, page_bbox_to_device_bbox
from pdftext.settings import settings
Expand All @@ -19,6 +20,12 @@ def update_previous_fonts(char_infos: List, i: int, prev_fontname: str, prev_fon
char_infos[j]["font"]["flags"] = fontflags


def flatten(page, flag=pdfium_c.FLAT_NORMALDISPLAY):
    """Flatten a page by calling pdfium's ``FPDFPage_Flatten``.

    Args:
        page: Page handle passed as the first argument to ``FPDFPage_Flatten``.
        flag: Flatten mode passed as the second argument; defaults to
            ``pdfium_c.FLAT_NORMALDISPLAY``.

    Raises:
        PdfiumError: If the call returns ``pdfium_c.FLATTEN_FAIL``.
    """
    status = pdfium_c.FPDFPage_Flatten(page, flag)
    # Any status other than FLATTEN_FAIL is treated as success.
    if status != pdfium_c.FLATTEN_FAIL:
        return
    raise PdfiumError("Failed to flatten annotations / form fields.")


def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings.FONTNAME_SAMPLE_FREQ):
blocks = []

Expand All @@ -27,7 +34,7 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings

if flatten_pdf:
# Flatten form fields and annotations into page contents.
page._flatten(flag=pdfium_c.FLAT_NORMALDISPLAY)
flatten(pdf, page)

# Flattening invalidates existing handles to the page.
# It is necessary to re-initialize the page handle after flattening.
Expand Down
108 changes: 1 addition & 107 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ include = [
[tool.poetry.dependencies]
python = ">=3.9,<3.13,!=3.9.7"
pypdfium2 = "^4.29.0"
scikit-learn = "1.4.2"
pydantic = "^2.7.1"
pydantic-settings = "^2.2.1"

Expand Down

0 comments on commit ea9822b

Please sign in to comment.