From 0fe40a2207ca2b88b5c19d57a26718388411c430 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Fri, 17 Jan 2025 13:41:38 +0100 Subject: [PATCH] Optimise tests --- .../document_loaders/pymupdf.ipynb | 472 ++++++++++-------- .../document_loaders/parsers/pdf.py | 2 +- .../document_loaders/pdf.py | 2 +- .../parsers/test_pdf_parsers.py | 45 +- .../document_loaders/test_pdf.py | 11 +- 5 files changed, 329 insertions(+), 203 deletions(-) diff --git a/docs/docs/integrations/document_loaders/pymupdf.ipynb b/docs/docs/integrations/document_loaders/pymupdf.ipynb index 81893dc7d684f6..eba73598d959dd 100644 --- a/docs/docs/integrations/document_loaders/pymupdf.ipynb +++ b/docs/docs/integrations/document_loaders/pymupdf.ipynb @@ -35,24 +35,26 @@ ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:" + "metadata": {}, + "source": [ + "If you want to get automated best in-class tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:" + ] }, { + "cell_type": "code", + "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2025-01-17T11:06:39.287984Z", - "start_time": "2025-01-17T11:06:39.285720Z" + "end_time": "2025-01-17T13:24:30.653579Z", + "start_time": "2025-01-17T13:24:30.650990Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n", "# os.environ[\"LANGSMITH_TRACING\"] = \"true\"" - ], - "outputs": [], - "execution_count": 1 + ] }, { "cell_type": "markdown", @@ -65,13 +67,13 @@ }, { "cell_type": "code", + "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2025-01-17T11:06:42.183569Z", - "start_time": "2025-01-17T11:06:40.528770Z" + "end_time": "2025-01-17T13:24:33.695776Z", + "start_time": 
"2025-01-17T13:24:31.737888Z" } }, - "source": "%pip install -qU langchain_community pymupdf", "outputs": [ { "name": "stdout", @@ -81,7 +83,9 @@ ] } ], - "execution_count": 2 + "source": [ + "%pip install -qU langchain_community pymupdf" + ] }, { "cell_type": "markdown", @@ -94,20 +98,20 @@ }, { "cell_type": "code", + "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:44.403523Z", "start_time": "2025-01-17T11:06:43.736030Z" } }, + "outputs": [], "source": [ "from langchain_community.document_loaders import PyMuPDFLoader\n", "\n", "file_path = \"./example_data/layout-parser-paper.pdf\"\n", "loader = PyMuPDFLoader(file_path)" - ], - "outputs": [], - "execution_count": 3 + ] }, { "cell_type": "markdown", @@ -118,16 +122,13 @@ }, { "cell_type": "code", + "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:46.138267Z", "start_time": "2025-01-17T11:06:46.001187Z" } }, - "source": [ - "docs = loader.load()\n", - "docs[0]" - ], "outputs": [ { "data": { @@ -140,21 +141,20 @@ "output_type": "execute_result" } ], - "execution_count": 4 + "source": [ + "docs = loader.load()\n", + "docs[0]" + ] }, { "cell_type": "code", + "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:46.646335Z", "start_time": "2025-01-17T11:06:46.642667Z" } }, - "source": [ - "import pprint\n", - "\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -177,7 +177,11 @@ ] } ], - "execution_count": 5 + "source": [ + "import pprint\n", + "\n", + "pprint.pp(docs[0].metadata)" + ] }, { "cell_type": "markdown", @@ -188,23 +192,13 @@ }, { "cell_type": "code", + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:48.147692Z", "start_time": "2025-01-17T11:06:48.094257Z" } }, - "source": [ - "pages = []\n", - "for doc in loader.lazy_load():\n", - " pages.append(doc)\n", - " if len(pages) >= 10:\n", - " # do some paged operation, e.g.\n", - " # index.upsert(page)\n", - 
"\n", - " pages = []\n", - "len(pages)" - ], "outputs": [ { "data": { @@ -217,20 +211,27 @@ "output_type": "execute_result" } ], - "execution_count": 6 + "source": [ + "pages = []\n", + "for doc in loader.lazy_load():\n", + " pages.append(doc)\n", + " if len(pages) >= 10:\n", + " # do some paged operation, e.g.\n", + " # index.upsert(page)\n", + "\n", + " pages = []\n", + "len(pages)" + ] }, { "cell_type": "code", + "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:50.003790Z", "start_time": "2025-01-17T11:06:50.000060Z" } }, - "source": [ - "print(pages[0].page_content[:100])\n", - "pprint.pp(pages[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -256,7 +257,10 @@ ] } ], - "execution_count": 7 + "source": [ + "print(pages[0].page_content[:100])\n", + "pprint.pp(pages[0].metadata)" + ] }, { "cell_type": "markdown", @@ -301,21 +305,13 @@ }, { "cell_type": "code", + "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:53.613494Z", "start_time": "2025-01-17T11:06:53.563930Z" } }, - "source": [ - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"page\",\n", - ")\n", - "docs = loader.load()\n", - "print(len(docs))\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -339,7 +335,15 @@ ] } ], - "execution_count": 8 + "source": [ + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"page\",\n", + ")\n", + "docs = loader.load()\n", + "print(len(docs))\n", + "pprint.pp(docs[0].metadata)" + ] }, { "cell_type": "markdown", @@ -357,21 +361,13 @@ }, { "cell_type": "code", + "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:06:55.955935Z", "start_time": "2025-01-17T11:06:55.903604Z" } }, - "source": [ - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"single\",\n", - ")\n", - "docs = loader.load()\n", - "print(len(docs))\n", - 
"pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -394,7 +390,15 @@ ] } ], - "execution_count": 9 + "source": [ + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"single\",\n", + ")\n", + "docs = loader.load()\n", + "print(len(docs))\n", + "pprint.pp(docs[0].metadata)" + ] }, { "cell_type": "markdown", @@ -406,25 +410,19 @@ { "cell_type": "markdown", "metadata": {}, - "source": "### Add a custom *pages_delimiter* to identify where are ends of pages in *single* mode:" + "source": [ + "### Add a custom *pages_delimiter* to identify where are ends of pages in *single* mode:" + ] }, { + "cell_type": "code", + "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:07:31.932597Z", "start_time": "2025-01-17T11:07:31.885499Z" } }, - "cell_type": "code", - "source": [ - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"single\",\n", - " pages_delimiter=\"\\n-------THIS IS A CUSTOM END OF PAGE-------\\n\",\n", - ")\n", - "docs = loader.load()\n", - "print(docs[0].page_content[:5780])" - ], "outputs": [ { "name": "stdout", @@ -528,7 +526,15 @@ ] } ], - "execution_count": 11 + "source": [ + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"single\",\n", + " pages_delimiter=\"\\n-------THIS IS A CUSTOM END OF PAGE-------\\n\",\n", + ")\n", + "docs = loader.load()\n", + "print(docs[0].page_content[:5780])" + ] }, { "cell_type": "markdown", @@ -567,15 +573,13 @@ }, { "cell_type": "code", + "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:07:39.281686Z", "start_time": "2025-01-17T11:07:37.500638Z" } }, - "source": [ - "%pip install -qU rapidocr-onnxruntime" - ], "outputs": [ { "name": "stdout", @@ -585,29 +589,20 @@ ] } ], - "execution_count": 12 + "source": [ + "%pip install -qU rapidocr-onnxruntime" + ] }, { + "cell_type": "code", + "execution_count": 13, "metadata": { 
"ExecuteTime": { "end_time": "2025-01-17T11:08:46.036783Z", "start_time": "2025-01-17T11:08:22.713011Z" - } + }, + "scrolled": true }, - "cell_type": "code", - "source": [ - "from langchain_community.document_loaders.parsers import RapidOCRBlobParser\n", - "\n", - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"page\",\n", - " images_inner_format=\"markdown-img\",\n", - " images_parser=RapidOCRBlobParser(),\n", - ")\n", - "docs = loader.load()\n", - "\n", - "print(docs[5].page_content)" - ], "outputs": [ { "name": "stdout", @@ -684,7 +679,19 @@ ] } ], - "execution_count": 14 + "source": [ + "from langchain_community.document_loaders.parsers import RapidOCRBlobParser\n", + "\n", + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"page\",\n", + " images_inner_format=\"markdown-img\",\n", + " images_parser=RapidOCRBlobParser(),\n", + ")\n", + "docs = loader.load()\n", + "\n", + "print(docs[5].page_content)" + ] }, { "cell_type": "markdown", @@ -702,15 +709,13 @@ }, { "cell_type": "code", + "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:08:53.698734Z", "start_time": "2025-01-17T11:08:52.248547Z" } }, - "source": [ - "%pip install -qU pytesseract" - ], "outputs": [ { "name": "stdout", @@ -720,28 +725,19 @@ ] } ], - "execution_count": 15 + "source": [ + "%pip install -qU pytesseract" + ] }, { + "cell_type": "code", + "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:09:03.699153Z", "start_time": "2025-01-17T11:08:55.660127Z" } }, - "cell_type": "code", - "source": [ - "from langchain_community.document_loaders.parsers import TesseractBlobParser\n", - "\n", - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"page\",\n", - " images_inner_format=\"html-img\",\n", - " images_parser=TesseractBlobParser(),\n", - ")\n", - "docs = loader.load()\n", - "print(docs[5].page_content)" - ], 
"outputs": [ { "name": "stdout", @@ -818,7 +814,18 @@ ] } ], - "execution_count": 16 + "source": [ + "from langchain_community.document_loaders.parsers import TesseractBlobParser\n", + "\n", + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"page\",\n", + " images_inner_format=\"html-img\",\n", + " images_parser=TesseractBlobParser(),\n", + ")\n", + "docs = loader.load()\n", + "print(docs[5].page_content)" + ] }, { "cell_type": "markdown", @@ -829,15 +836,13 @@ }, { "cell_type": "code", + "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:09:08.637429Z", "start_time": "2025-01-17T11:09:07.177157Z" } }, - "source": [ - "%pip install -qU langchain_openai" - ], "outputs": [ { "name": "stdout", @@ -847,23 +852,19 @@ ] } ], - "execution_count": 17 + "source": [ + "%pip install -qU langchain_openai" + ] }, { "cell_type": "code", + "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:09:09.670266Z", "start_time": "2025-01-17T11:09:09.634422Z" } }, - "source": [ - "import os\n", - "\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()" - ], "outputs": [ { "data": { @@ -871,52 +872,45 @@ "True" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 18 + "source": [ + "import os\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" + ] }, { "cell_type": "code", + "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2025-01-17T11:09:11.652399Z", "start_time": "2025-01-17T11:09:11.649497Z" } }, + "outputs": [], "source": [ "from getpass import getpass\n", "\n", "if not os.environ.get(\"OPENAI_API_KEY\"):\n", " os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key =\")" - ], - "outputs": [], - "execution_count": 19 + ] }, { + "cell_type": "code", + "execution_count": 19, "metadata": { "ExecuteTime": { - "end_time": "2025-01-17T11:10:15.732342Z", + 
"end_time": "2025-01-17T12:46:33.398682Z", "start_time": "2025-01-17T11:09:14.102369Z" } }, - "cell_type": "code", - "source": [ - "from langchain_community.document_loaders.parsers import LLMImageBlobParser\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"page\",\n", - " images_inner_format=\"markdown-img\",\n", - " images_parser=LLMImageBlobParser(\n", - " model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024)),\n", - ")\n", - "docs = loader.load()\n", - "print(docs[5].page_content)" - ], "outputs": [ { "name": "stdout", @@ -954,22 +948,19 @@ "\n", "\n", "\n", - "![**Image Summary for Retrieval:**\n", - "\n", - "Diagram illustrating transformation and operation APIs with elements like coordinates, rectangles, quadrilaterals, text blocks, and layout lists. Includes extra features such as block text, type, and reading order.\n", + "![**Image Summary:** Diagram illustrating coordinate systems and textblock features in layout processing. Includes intervals, rectangles, quadrilaterals, and extra features. 
Textblock elements feature block text, type, and reading order, all transformed by the same APIs.\n", "\n", "**Extracted Text:**\n", "\n", "Coordinate\n", - "\n", "Coordinate\n", "\n", - "x-interval\n", - "\n", "start\n", "\n", "start\n", "\n", + "x-interval\n", + "\n", "end\n", "\n", "y-interval\n", @@ -984,14 +975,16 @@ "\n", "(x1, y1)\n", "\n", - "( x2, y2)\n", - "\n", "Quadrilateral\n", "\n", + "(x2, y2)\n", + "\n", "(x4, y4)\n", "\n", "(x3, y3)\n", "\n", + "→\n", + "\n", "The same transformation and operation APIs\n", "\n", "textblock\n", @@ -1010,6 +1003,8 @@ "\n", "...\n", "\n", + "→\n", + "\n", "layout\n", "\n", "[ coordinate1, textblock1, ...\n", @@ -1020,7 +1015,19 @@ ] } ], - "execution_count": 20 + "source": [ + "from langchain_community.document_loaders.parsers import LLMImageBlobParser\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"page\",\n", + " images_inner_format=\"markdown-img\",\n", + " images_parser=LLMImageBlobParser(model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024)),\n", + ")\n", + "docs = loader.load()\n", + "print(docs[5].page_content)" + ] }, { "cell_type": "markdown", @@ -1037,40 +1044,113 @@ ] }, { + "cell_type": "code", + "execution_count": 20, "metadata": { "ExecuteTime": { - "end_time": "2025-01-17T11:12:07.687810Z", - "start_time": "2025-01-17T11:12:06.352661Z" + "end_time": "2025-01-17T12:46:34.812794Z", + "start_time": "2025-01-17T12:46:33.475764Z" } }, - "cell_type": "code", - "source": [ - "from IPython.display import display, Markdown\n", - "loader = PyMuPDFLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " mode=\"page\",\n", - " extract_tables=\"markdown\",\n", - ")\n", - "docs = loader.load()\n", - "display(Markdown(docs[4].page_content))" - ], "outputs": [ { "data": { + "text/markdown": [ + "LayoutParser: A Unified Toolkit for DL-Based DIA\n", + "5\n", + "Table 1: Current layout detection models in the 
LayoutParser model zoo\n", + "Dataset\n", + "Base Model1 Large Model\n", + "Notes\n", + "PubLayNet [38]\n", + "F / M\n", + "M\n", + "Layouts of modern scientific documents\n", + "PRImA [3]\n", + "M\n", + "-\n", + "Layouts of scanned modern magazines and scientific reports\n", + "Newspaper [17]\n", + "F\n", + "-\n", + "Layouts of scanned US newspapers from the 20th century\n", + "TableBank [18]\n", + "F\n", + "F\n", + "Table region on modern scientific and business document\n", + "HJDataset [31]\n", + "F / M\n", + "-\n", + "Layouts of history Japanese documents\n", + "1 For each dataset, we train several models of different sizes for different needs (the trade-offbetween accuracy\n", + "vs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101\n", + "backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask\n", + "R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained\n", + "using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model\n", + "zoo in coming months.\n", + "layout data structures, which are optimized for efficiency and versatility. 3) When\n", + "necessary, users can employ existing or customized OCR models via the unified\n", + "API provided in the OCR module. 4) LayoutParser comes with a set of utility\n", + "functions for the visualization and storage of the layout data. 5) LayoutParser\n", + "is also highly customizable, via its integration with functions for layout data\n", + "annotation and model training. We now provide detailed descriptions for each\n", + "component.\n", + "3.1\n", + "Layout Detection Models\n", + "In LayoutParser, a layout model takes a document image as an input and\n", + "generates a list of rectangular boxes for the target content regions. 
Different\n", + "from traditional methods, it relies on deep convolutional neural networks rather\n", + "than manually curated rules to identify content regions. It is formulated as an\n", + "object detection problem and state-of-the-art models like Faster R-CNN [28] and\n", + "Mask R-CNN [12] are used. This yields prediction results of high accuracy and\n", + "makes it possible to build a concise, generalized interface for layout detection.\n", + "LayoutParser, built upon Detectron2 [35], provides a minimal API that can\n", + "perform layout detection with only four lines of code in Python:\n", + "1 import\n", + "layoutparser as lp\n", + "2 image = cv2.imread(\"image_file\") # load\n", + "images\n", + "3 model = lp. Detectron2LayoutModel (\n", + "4\n", + "\"lp:// PubLayNet/ faster_rcnn_R_50_FPN_3x /config\")\n", + "5 layout = model.detect(image)\n", + "LayoutParser provides a wealth of pre-trained model weights using various\n", + "datasets covering different languages, time periods, and document types. Due to\n", + "domain shift [7], the prediction performance can notably drop when models are ap-\n", + "plied to target samples that are significantly different from the training dataset. As\n", + "document structures and layouts vary greatly in different domains, it is important\n", + "to select models trained on a dataset similar to the test samples. 
A semantic syntax\n", + "is used for initializing the model weights in LayoutParser, using both the dataset\n", + "name and model name lp:///.\n", + "\n", + "\n", + "|Dataset|Base Model1|Large Model|Notes|\n", + "|---|---|---|---|\n", + "|PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31]|F / M M F F F / M|M - - F -|Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents|" + ], "text/plain": [ "" - ], - "text/markdown": "LayoutParser: A Unified Toolkit for DL-Based DIA\n5\nTable 1: Current layout detection models in the LayoutParser model zoo\nDataset\nBase Model1 Large Model\nNotes\nPubLayNet [38]\nF / M\nM\nLayouts of modern scientific documents\nPRImA [3]\nM\n-\nLayouts of scanned modern magazines and scientific reports\nNewspaper [17]\nF\n-\nLayouts of scanned US newspapers from the 20th century\nTableBank [18]\nF\nF\nTable region on modern scientific and business document\nHJDataset [31]\nF / M\n-\nLayouts of history Japanese documents\n1 For each dataset, we train several models of different sizes for different needs (the trade-offbetween accuracy\nvs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101\nbackbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask\nR-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained\nusing the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model\nzoo in coming months.\nlayout data structures, which are optimized for efficiency and versatility. 3) When\nnecessary, users can employ existing or customized OCR models via the unified\nAPI provided in the OCR module. 
4) LayoutParser comes with a set of utility\nfunctions for the visualization and storage of the layout data. 5) LayoutParser\nis also highly customizable, via its integration with functions for layout data\nannotation and model training. We now provide detailed descriptions for each\ncomponent.\n3.1\nLayout Detection Models\nIn LayoutParser, a layout model takes a document image as an input and\ngenerates a list of rectangular boxes for the target content regions. Different\nfrom traditional methods, it relies on deep convolutional neural networks rather\nthan manually curated rules to identify content regions. It is formulated as an\nobject detection problem and state-of-the-art models like Faster R-CNN [28] and\nMask R-CNN [12] are used. This yields prediction results of high accuracy and\nmakes it possible to build a concise, generalized interface for layout detection.\nLayoutParser, built upon Detectron2 [35], provides a minimal API that can\nperform layout detection with only four lines of code in Python:\n1 import\nlayoutparser as lp\n2 image = cv2.imread(\"image_file\") # load\nimages\n3 model = lp. Detectron2LayoutModel (\n4\n\"lp:// PubLayNet/ faster_rcnn_R_50_FPN_3x /config\")\n5 layout = model.detect(image)\nLayoutParser provides a wealth of pre-trained model weights using various\ndatasets covering different languages, time periods, and document types. Due to\ndomain shift [7], the prediction performance can notably drop when models are ap-\nplied to target samples that are significantly different from the training dataset. As\ndocument structures and layouts vary greatly in different domains, it is important\nto select models trained on a dataset similar to the test samples. 
A semantic syntax\nis used for initializing the model weights in LayoutParser, using both the dataset\nname and model name lp:///.\n\n\n|Dataset|Base Model1|Large Model|Notes|\n|---|---|---|---|\n|PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31]|F / M M F F F / M|M - - F -|Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents|" + ] }, "metadata": {}, "output_type": "display_data" } ], - "execution_count": 24 + "source": [ + "from IPython.display import Markdown, display\n", + "\n", + "loader = PyMuPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"page\",\n", + " extract_tables=\"markdown\",\n", + ")\n", + "docs = loader.load()\n", + "display(Markdown(docs[4].page_content))" + ] }, { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "## Working with Files\n", "\n", @@ -1081,29 +1161,14 @@ ] }, { + "cell_type": "code", + "execution_count": 21, "metadata": { "ExecuteTime": { - "end_time": "2025-01-17T11:12:26.844599Z", - "start_time": "2025-01-17T11:12:26.789346Z" + "end_time": "2025-01-17T12:46:34.866868Z", + "start_time": "2025-01-17T12:46:34.819048Z" } }, - "cell_type": "code", - "source": [ - "from langchain_community.document_loaders import FileSystemBlobLoader\n", - "from langchain_community.document_loaders.generic import GenericLoader\n", - "from langchain_community.document_loaders.parsers import PyMuPDFParser\n", - "\n", - "loader = GenericLoader(\n", - " blob_loader=FileSystemBlobLoader(\n", - " path=\"./example_data/\",\n", - " glob=\"*.pdf\",\n", - " ),\n", - " blob_parser=PyMuPDFParser(),\n", - ")\n", - "docs = loader.load()\n", - "print(docs[0].page_content)\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -1168,16 +1233,35 @@ ] } ], - "execution_count": 25 + 
"source": [ + "from langchain_community.document_loaders import FileSystemBlobLoader\n", + "from langchain_community.document_loaders.generic import GenericLoader\n", + "from langchain_community.document_loaders.parsers import PyMuPDFParser\n", + "\n", + "loader = GenericLoader(\n", + " blob_loader=FileSystemBlobLoader(\n", + " path=\"./example_data/\",\n", + " glob=\"*.pdf\",\n", + " ),\n", + " blob_parser=PyMuPDFParser(),\n", + ")\n", + "docs = loader.load()\n", + "print(docs[0].page_content)\n", + "pprint.pp(docs[0].metadata)" + ] }, { "cell_type": "markdown", "metadata": {}, - "source": "It is possible to work with files from cloud storage." + "source": [ + "It is possible to work with files from cloud storage." + ] }, { "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ "from langchain_community.document_loaders import CloudBlobLoader\n", "from langchain_community.document_loaders.generic import GenericLoader\n", @@ -1192,9 +1276,7 @@ "docs = loader.load()\n", "print(docs[0].page_content)\n", "pprint.pp(docs[0].metadata)" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 4eb493cf41e935..254849df802738 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -466,7 +466,7 @@ class PyMuPDFParser(BaseBlobParser): parser = PyMuPDFParser( # password = None, mode = "single", - pages_delimitor = "\n\f", + pages_delimiter = "\n\f", # extract_images = True, # images_parser = TesseractBlobParser(), # extract_tables="markdown", diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index bd98d72db922d3..af0aa86b4b5b39 100644 --- 
a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -495,7 +495,7 @@ def __init__( pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, extract_images: bool = False, images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format:str="text", + images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, headers: Optional[dict] = None, extract_tables_settings: Optional[dict[str, Any]] = None, diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py index ee0fe365885bb5..44cc8294643f1b 100644 --- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py +++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py @@ -143,13 +143,31 @@ def _analyze_image(self, img: "Image") -> str: @pytest.mark.parametrize( - "mode", - ["single", "page"], + "mode,image_parser", + [("single", EmptyImageBlobParser()), ("page", None)], ) @pytest.mark.parametrize( - "image_parser", - [EmptyImageBlobParser(), None], + "parser_factory,params", + [ + ("PyMuPDFParser", {}), + ], ) +@pytest.mark.requires("pillow") +def test_mode_and_extract_images_variations( + parser_factory: str, + params: dict, + mode: str, + image_parser: BaseImageBlobParser, +) -> None: + _test_matrix( + parser_factory, + params, + mode, + image_parser, + images_inner_format="text", + ) + + @pytest.mark.parametrize( "images_inner_format", ["text", "markdown-img", "html-img"], @@ -161,7 +179,24 @@ def _analyze_image(self, img: "Image") -> str: ], ) @pytest.mark.requires("pillow") -def test_mode_and_extract_images_variations( +def test_mode_and_image_formats_variations( + parser_factory: str, + params: dict, + images_inner_format: str, +) -> None: + mode = "single" + 
image_parser = EmptyImageBlobParser() + + _test_matrix( + parser_factory, + params, + mode, + image_parser, + images_inner_format, + ) + + +def _test_matrix( parser_factory: str, params: dict, mode: str, diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index a681dce8c59c01..7eae7ef710d429 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -226,7 +226,16 @@ def test_standard_parameters( assert len(docs) == 1 file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" - loader = loader_class(file_path, mode="page") + loader = loader_class( + file_path, + mode="page", + pages_delimiter="---", + images_parser=None, + images_inner_format="text", + password=None, + extract_tables=None, + extract_tables_settings=None, + ) docs = loader.load() assert len(docs) == 16 assert loader.web_path is None