fix: Test cases for RTL programmatic PDFs and fixes for the formula model (#903)

fix: Support for RTL programmatic documents
fix(parser): detect and handle rotated pages
fix(parser): fix bug causing duplicated text
fix(formula): improve stopping criteria
chore: update lock file
fix: temporary constrain beautifulsoup

* switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <[email protected]>
* switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <[email protected]>
* cleaned up the data folder in the tests Signed-off-by: Peter Staar <[email protected]>
* switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <[email protected]>
* added three test-files for right-to-left Signed-off-by: Peter Staar <[email protected]>
* fix black Signed-off-by: Matteo-Omenetti <[email protected]>
* added new gt for test_e2e_conversion Signed-off-by: Matteo-Omenetti <[email protected]>
* added new gt for test_e2e_conversion Signed-off-by: Matteo-Omenetti <[email protected]>
* Add code to expose text direction of cell Signed-off-by: Christoph Auer <[email protected]>
* new test file Signed-off-by: Matteo-Omenetti <[email protected]>
* update lock Signed-off-by: Michele Dolfi <[email protected]>
* fix mypy reports Signed-off-by: Michele Dolfi <[email protected]>
* fix example filepaths Signed-off-by: Michele Dolfi <[email protected]>
* add test data results Signed-off-by: Michele Dolfi <[email protected]>
* pin wheel of latest docling-parse release Signed-off-by: Michele Dolfi <[email protected]>
* use latest docling-core Signed-off-by: Michele Dolfi <[email protected]>
* remove debugging code Signed-off-by: Michele Dolfi <[email protected]>
* fix path to files in example Signed-off-by: Michele Dolfi <[email protected]>
* Revert unwanted RTL additions Signed-off-by: Christoph Auer <[email protected]>
* Fix test data paths in examples Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Matteo-Omenetti <[email protected]>
Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Matteo-Omenetti <[email protected]>
Co-authored-by: Peter Staar <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
DS4SD · Feb 7, 2025 · 9114ada · 9114ada
1 parent ed74fe2
commit 9114ada
Show file tree

Hide file tree

Showing 91 changed files with 620 additions and 313 deletions.
diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union
 
+import numpy as np
 from docling_core.types.doc import (
     CodeItem,
     DocItemLabel,
@@ -103,7 +104,7 @@ def __init__(
                 artifacts_path = artifacts_path / self._model_repo_folder
 
             self.code_formula_model = CodeFormulaPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                 device=device,
                 num_threads=accelerator_options.num_threads,
             )
@@ -123,7 +124,7 @@ def download_models(
             repo_id="ds4sd/CodeFormula",
             force_download=force,
             local_dir=local_dir,
-            revision="v1.0.0",
+            revision="v1.0.1",
         )
 
         return Path(download_path)
@@ -231,7 +232,7 @@ def __call__(
             return
 
         labels: List[str] = []
-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[TextItem] = []
         for el in element_batch:
             assert isinstance(el.item, TextItem)

diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union
 
+import numpy as np
 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
@@ -94,7 +95,7 @@ def __init__(
                 artifacts_path = artifacts_path / self._model_repo_folder
 
             self.document_picture_classifier = DocumentFigureClassifierPredictor(
-                artifacts_path=artifacts_path,
+                artifacts_path=str(artifacts_path),
                 device=device,
                 num_threads=accelerator_options.num_threads,
             )
@@ -161,7 +162,7 @@ def __call__(
                 yield element
             return
 
-        images: List[Image.Image] = []
+        images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[PictureItem] = []
         for el in element_batch:
             assert isinstance(el, PictureItem)

diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
@@ -150,10 +150,12 @@ def __call__(
             else:
                 with TimeRecorder(conv_res, "layout"):
                     assert page.size is not None
+                    page_image = page.get_image(scale=1.0)
+                    assert page_image is not None
 
                     clusters = []
                     for ix, pred_item in enumerate(
-                        self.layout_predictor.predict(page.get_image(scale=1.0))
+                        self.layout_predictor.predict(page_image)
                     ):
                         label = DocItemLabel(
                             pred_item["label"]

diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py
@@ -103,10 +103,10 @@ def main():
     logging.basicConfig(level=logging.INFO)
 
     input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-        Path("./tests/data/2203.01017v2.pdf"),
-        Path("./tests/data/2305.03393v1.pdf"),
-        Path("./tests/data/redp5110_sampled.pdf"),
+        Path("./tests/data/pdf/2206.01062.pdf"),
+        Path("./tests/data/pdf/2203.01017v2.pdf"),
+        Path("./tests/data/pdf/2305.03393v1.pdf"),
+        Path("./tests/data/pdf/redp5110_sampled.pdf"),
     ]
 
     # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())

diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py
@@ -21,7 +21,7 @@
 def main():
     logging.basicConfig(level=logging.INFO)
 
-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
 
     ###########################################################################
 

diff --git a/docs/examples/develop_formula_understanding.py b/docs/examples/develop_formula_understanding.py
@@ -68,7 +68,7 @@ def get_default_options(cls) -> ExampleFormulaUnderstandingPipelineOptions:
 def main():
     logging.basicConfig(level=logging.INFO)
 
-    input_doc_path = Path("./tests/data/2203.01017v2.pdf")
+    input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")
 
     pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
     pipeline_options.do_formula_understanding = True

diff --git a/docs/examples/develop_picture_enrichment.py b/docs/examples/develop_picture_enrichment.py
@@ -71,7 +71,7 @@ def get_default_options(cls) -> ExamplePictureClassifierPipelineOptions:
 def main():
     logging.basicConfig(level=logging.INFO)
 
-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
 
     pipeline_options = ExamplePictureClassifierPipelineOptions()
     pipeline_options.images_scale = 2.0

diff --git a/docs/examples/export_figures.py b/docs/examples/export_figures.py
@@ -16,7 +16,7 @@
 def main():
     logging.basicConfig(level=logging.INFO)
 
-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     output_dir = Path("scratch")
 
     # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

diff --git a/docs/examples/export_multimodal.py b/docs/examples/export_multimodal.py
@@ -19,7 +19,7 @@
 def main():
     logging.basicConfig(level=logging.INFO)
 
-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     output_dir = Path("scratch")
 
     # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

diff --git a/docs/examples/export_tables.py b/docs/examples/export_tables.py
@@ -12,7 +12,7 @@
 def main():
     logging.basicConfig(level=logging.INFO)
 
-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     output_dir = Path("scratch")
 
     doc_converter = DocumentConverter()

diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py
@@ -14,7 +14,7 @@
 
 
 def main():
-    input_doc = Path("./tests/data/2206.01062.pdf")
+    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
 
     pipeline_options = PdfPipelineOptions()
     pipeline_options.do_ocr = True

diff --git a/docs/examples/inspect_picture_content.py b/docs/examples/inspect_picture_content.py
@@ -4,7 +4,7 @@
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
 
-source = "tests/data/amt_handbook_sample.pdf"
+source = "tests/data/pdf/amt_handbook_sample.pdf"
 
 pipeline_options = PdfPipelineOptions()
 pipeline_options.images_scale = 2

diff --git a/docs/examples/run_with_accelerator.py b/docs/examples/run_with_accelerator.py
@@ -14,7 +14,7 @@
 
 
 def main():
-    input_doc = Path("./tests/data/2206.01062.pdf")
+    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
 
     # Explicitly set the accelerator
     # accelerator_options = AcceleratorOptions(

diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py
@@ -25,9 +25,8 @@ def main():
         Path("tests/data/docx/lorem_ipsum.docx"),
         Path("tests/data/pptx/powerpoint_sample.pptx"),
         Path("tests/data/2305.03393v1-pg9-img.png"),
-        Path("tests/data/2206.01062.pdf"),
-        Path("tests/data/test_01.asciidoc"),
-        Path("tests/data/test_01.asciidoc"),
+        Path("tests/data/pdf/2206.01062.pdf"),
+        Path("tests/data/asciidoc/test_01.asciidoc"),
     ]
 
     ## for defaults use:

diff --git a/docs/examples/tesseract_lang_detection.py b/docs/examples/tesseract_lang_detection.py
@@ -10,7 +10,7 @@
 
 
 def main():
-    input_doc = Path("./tests/data/2206.01062.pdf")
+    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
 
     # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
     # ocr_options = TesseractOcrOptions(lang=["auto"])

diff --git a/docs/examples/translate.py b/docs/examples/translate.py
@@ -32,7 +32,7 @@ def translate(text: str, src: str = "en", dest: str = "de"):
 def main():
     logging.basicConfig(level=logging.INFO)
 
-    input_doc_path = Path("./tests/data/2206.01062.pdf")
+    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     output_dir = Path("scratch")
 
     # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

diff --git a/docs/v2.md b/docs/v2.md
@@ -117,12 +117,12 @@ conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/240
 ## Convert several files at once:
 
 input_files = [
-    "tests/data/wiki_duck.html",
-    "tests/data/word_sample.docx",
-    "tests/data/lorem_ipsum.docx",
-    "tests/data/powerpoint_sample.pptx",
+    "tests/data/html/wiki_duck.html",
+    "tests/data/docx/word_sample.docx",
+    "tests/data/docx/lorem_ipsum.docx",
+    "tests/data/pptx/powerpoint_sample.pptx",
     "tests/data/2305.03393v1-pg9-img.png",
-    "tests/data/2206.01062.pdf",
+    "tests/data/pdf/2206.01062.pdf",
 ]
 
 # Directly pass list of files or streams to `convert_all`