Skip to content

Commit

Permalink
fix: Test cases for RTL programmatic PDFs and fixes for the formula model (#903)
Browse files Browse the repository at this point in the history

fix: Support for RTL programmatic documents
fix(parser): detect and handle rotated pages
fix(parser): fix bug causing duplicated text
fix(formula): improve stopping criteria
chore: update lock file
fix: temporarily constrain beautifulsoup


* switch to code formula model v1.0.1 and new test pdf

Signed-off-by: Matteo-Omenetti <[email protected]>

* switch to code formula model v1.0.1 and new test pdf

Signed-off-by: Matteo-Omenetti <[email protected]>

* cleaned up the data folder in the tests

Signed-off-by: Peter Staar <[email protected]>

* switch to code formula model v1.0.1 and new test pdf

Signed-off-by: Matteo-Omenetti <[email protected]>

* added three test-files for right-to-left

Signed-off-by: Peter Staar <[email protected]>

* fix black

Signed-off-by: Matteo-Omenetti <[email protected]>

* added new gt for test_e2e_conversion

Signed-off-by: Matteo-Omenetti <[email protected]>

* added new gt for test_e2e_conversion

Signed-off-by: Matteo-Omenetti <[email protected]>

* Add code to expose text direction of cell

Signed-off-by: Christoph Auer <[email protected]>

* new test file

Signed-off-by: Matteo-Omenetti <[email protected]>

* update lock

Signed-off-by: Michele Dolfi <[email protected]>

* fix mypy reports

Signed-off-by: Michele Dolfi <[email protected]>

* fix example filepaths

Signed-off-by: Michele Dolfi <[email protected]>

* add test data results

Signed-off-by: Michele Dolfi <[email protected]>

* pin wheel of latest docling-parse release

Signed-off-by: Michele Dolfi <[email protected]>

* use latest docling-core

Signed-off-by: Michele Dolfi <[email protected]>

* remove debugging code

Signed-off-by: Michele Dolfi <[email protected]>

* fix path to files in example

Signed-off-by: Michele Dolfi <[email protected]>

* Revert unwanted RTL additions

Signed-off-by: Christoph Auer <[email protected]>

* Fix test data paths in examples

Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Matteo-Omenetti <[email protected]>
Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Matteo-Omenetti <[email protected]>
Co-authored-by: Peter Staar <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
  • Loading branch information
4 people authored Feb 7, 2025
1 parent ed74fe2 commit 9114ada
Show file tree
Hide file tree
Showing 91 changed files with 620 additions and 313 deletions.
7 changes: 4 additions & 3 deletions docling/models/code_formula_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path
from typing import Iterable, List, Literal, Optional, Tuple, Union

import numpy as np
from docling_core.types.doc import (
CodeItem,
DocItemLabel,
Expand Down Expand Up @@ -103,7 +104,7 @@ def __init__(
artifacts_path = artifacts_path / self._model_repo_folder

self.code_formula_model = CodeFormulaPredictor(
artifacts_path=artifacts_path,
artifacts_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
)
Expand All @@ -123,7 +124,7 @@ def download_models(
repo_id="ds4sd/CodeFormula",
force_download=force,
local_dir=local_dir,
revision="v1.0.0",
revision="v1.0.1",
)

return Path(download_path)
Expand Down Expand Up @@ -231,7 +232,7 @@ def __call__(
return

labels: List[str] = []
images: List[Image.Image] = []
images: List[Union[Image.Image, np.ndarray]] = []
elements: List[TextItem] = []
for el in element_batch:
assert isinstance(el.item, TextItem)
Expand Down
5 changes: 3 additions & 2 deletions docling/models/document_picture_classifier.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
from typing import Iterable, List, Literal, Optional, Tuple, Union

import numpy as np
from docling_core.types.doc import (
DoclingDocument,
NodeItem,
Expand Down Expand Up @@ -94,7 +95,7 @@ def __init__(
artifacts_path = artifacts_path / self._model_repo_folder

self.document_picture_classifier = DocumentFigureClassifierPredictor(
artifacts_path=artifacts_path,
artifacts_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
)
Expand Down Expand Up @@ -161,7 +162,7 @@ def __call__(
yield element
return

images: List[Image.Image] = []
images: List[Union[Image.Image, np.ndarray]] = []
elements: List[PictureItem] = []
for el in element_batch:
assert isinstance(el, PictureItem)
Expand Down
4 changes: 3 additions & 1 deletion docling/models/layout_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,12 @@ def __call__(
else:
with TimeRecorder(conv_res, "layout"):
assert page.size is not None
page_image = page.get_image(scale=1.0)
assert page_image is not None

clusters = []
for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
self.layout_predictor.predict(page_image)
):
label = DocItemLabel(
pred_item["label"]
Expand Down
8 changes: 4 additions & 4 deletions docs/examples/batch_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,10 @@ def main():
logging.basicConfig(level=logging.INFO)

input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
Path("./tests/data/2203.01017v2.pdf"),
Path("./tests/data/2305.03393v1.pdf"),
Path("./tests/data/redp5110_sampled.pdf"),
Path("./tests/data/pdf/2206.01062.pdf"),
Path("./tests/data/pdf/2203.01017v2.pdf"),
Path("./tests/data/pdf/2305.03393v1.pdf"),
Path("./tests/data/pdf/redp5110_sampled.pdf"),
]

# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/custom_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")

###########################################################################

Expand Down
2 changes: 1 addition & 1 deletion docs/examples/develop_formula_understanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def get_default_options(cls) -> ExampleFormulaUnderstandingPipelineOptions:
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2203.01017v2.pdf")
input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")

pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
pipeline_options.do_formula_understanding = True
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/develop_picture_enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def get_default_options(cls) -> ExamplePictureClassifierPipelineOptions:
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")

pipeline_options = ExamplePictureClassifierPipelineOptions()
pipeline_options.images_scale = 2.0
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/export_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/export_multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/export_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")

doc_converter = DocumentConverter()
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/full_page_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def main():
input_doc = Path("./tests/data/2206.01062.pdf")
input_doc = Path("./tests/data/pdf/2206.01062.pdf")

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/inspect_picture_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

source = "tests/data/amt_handbook_sample.pdf"
source = "tests/data/pdf/amt_handbook_sample.pdf"

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/run_with_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def main():
input_doc = Path("./tests/data/2206.01062.pdf")
input_doc = Path("./tests/data/pdf/2206.01062.pdf")

# Explicitly set the accelerator
# accelerator_options = AcceleratorOptions(
Expand Down
5 changes: 2 additions & 3 deletions docs/examples/run_with_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@ def main():
Path("tests/data/docx/lorem_ipsum.docx"),
Path("tests/data/pptx/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
Path("tests/data/test_01.asciidoc"),
Path("tests/data/test_01.asciidoc"),
Path("tests/data/pdf/2206.01062.pdf"),
Path("tests/data/asciidoc/test_01.asciidoc"),
]

## for defaults use:
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/tesseract_lang_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


def main():
input_doc = Path("./tests/data/2206.01062.pdf")
input_doc = Path("./tests/data/pdf/2206.01062.pdf")

# Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
# ocr_options = TesseractOcrOptions(lang=["auto"])
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def translate(text: str, src: str = "en", dest: str = "de"):
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
Expand Down
10 changes: 5 additions & 5 deletions docs/v2.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,12 @@ conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/240
## Convert several files at once:

input_files = [
"tests/data/wiki_duck.html",
"tests/data/word_sample.docx",
"tests/data/lorem_ipsum.docx",
"tests/data/powerpoint_sample.pptx",
"tests/data/html/wiki_duck.html",
"tests/data/docx/word_sample.docx",
"tests/data/docx/lorem_ipsum.docx",
"tests/data/pptx/powerpoint_sample.pptx",
"tests/data/2305.03393v1-pg9-img.png",
"tests/data/2206.01062.pdf",
"tests/data/pdf/2206.01062.pdf",
]

# Directly pass list of files or streams to `convert_all`
Expand Down
Loading

0 comments on commit 9114ada

Please sign in to comment.