From e4f686deb71e59b8d36a6c31a5480a676f522a34 Mon Sep 17 00:00:00 2001 From: Vikey Chen Date: Wed, 3 Apr 2024 21:00:20 +0800 Subject: [PATCH] fix unstructured api,remove unused parameters (#3056) --- .../rag/extractor/unstructured/unstructured_eml_extractor.py | 2 +- .../extractor/unstructured/unstructured_markdown_extractor.py | 2 +- .../rag/extractor/unstructured/unstructured_msg_extractor.py | 2 +- .../rag/extractor/unstructured/unstructured_ppt_extractor.py | 4 ++-- .../rag/extractor/unstructured/unstructured_pptx_extractor.py | 2 +- .../rag/extractor/unstructured/unstructured_text_extractor.py | 2 +- .../rag/extractor/unstructured/unstructured_xml_extractor.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py index f6ae8fad533f7d..2e704f187d05d6 100644 --- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py @@ -26,7 +26,7 @@ def __init__( def extract(self) -> list[Document]: from unstructured.partition.email import partition_email - elements = partition_email(filename=self._file_path, api_url=self._api_url) + elements = partition_email(filename=self._file_path) # noinspection PyBroadException try: diff --git a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py index 3d63446fef15e6..144b4e0c1d7a91 100644 --- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py @@ -36,7 +36,7 @@ def __init__( def extract(self) -> list[Document]: from unstructured.partition.md import partition_md - elements = partition_md(filename=self._file_path, api_url=self._api_url) + elements = partition_md(filename=self._file_path) from unstructured.chunking.title import chunk_by_title chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) documents = [] diff --git a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py index 34d3e8021a61ac..ad09b79eb00a07 100644 --- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py @@ -26,7 +26,7 @@ def __init__( def extract(self) -> list[Document]: from unstructured.partition.msg import partition_msg - elements = partition_msg(filename=self._file_path, api_url=self._api_url) + elements = partition_msg(filename=self._file_path) from unstructured.chunking.title import chunk_by_title chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) documents = [] diff --git a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py index cd3aba9866a17c..6d3ffe6589c9f5 100644 --- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py @@ -24,9 +24,9 @@ def __init__( self._api_url = api_url def extract(self) -> list[Document]: - from unstructured.partition.ppt import partition_ppt + from unstructured.partition.api import partition_via_api - elements = partition_ppt(filename=self._file_path, api_url=self._api_url) + elements = partition_via_api(filename=self._file_path, api_url=self._api_url) text_by_page = {} for element in elements: page = element.metadata.page_number diff --git a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py index f9667d2527fbc4..6fcbb5feb991d0 100644 --- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py @@ -26,7 +26,7 @@ def __init__( def extract(self) -> list[Document]: from unstructured.partition.pptx import partition_pptx - elements = partition_pptx(filename=self._file_path, api_url=self._api_url) + elements = partition_pptx(filename=self._file_path) text_by_page = {} for element in elements: page = element.metadata.page_number diff --git a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py index cc67f2b8665fa1..f4a4adbc1600fd 100644 --- a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py @@ -26,7 +26,7 @@ def __init__( def extract(self) -> list[Document]: from unstructured.partition.text import partition_text - elements = partition_text(filename=self._file_path, api_url=self._api_url) + elements = partition_text(filename=self._file_path) from unstructured.chunking.title import chunk_by_title chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) documents = [] diff --git a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py index 5600fb075d2ac1..6aef8e0f7e2718 100644 --- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py @@ -26,7 +26,7 @@ def __init__( def extract(self) -> list[Document]: from unstructured.partition.xml import partition_xml - elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url) + elements = partition_xml(filename=self._file_path, xml_keep_tags=True) from unstructured.chunking.title import chunk_by_title chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) documents = []