From 4cda348cf87f56ff237e376b03732b1b47a99215 Mon Sep 17 00:00:00 2001
From: MDW
Date: Fri, 19 May 2023 16:23:09 +0200
Subject: [PATCH] Fix #294 (tested)

---
Review amendment: re-raising with type(e)(message) fails for exception
types that cannot be built from a single string (e.g. UnicodeDecodeError),
which would hide the real loader error behind a TypeError. The added code
now falls back to RuntimeError in that case. Hunk headers and the diffstat
were updated to match; the post-image hash on the "index" line predates
this amendment.

 ingest.py | 57 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/ingest.py b/ingest.py
index f22610228..12049abca 100644
--- a/ingest.py
+++ b/ingest.py
@@ -24,6 +24,45 @@
 from constants import CHROMA_SETTINGS
 
 
+class MyElmLoader(UnstructuredEmailLoader):
+    """Email loader that falls back to text/plain when no HTML part is found."""
+
+    def load(self) -> List[Document]:
+        """Load the email, retrying as plain text when it has no HTML part.
+
+        Returns:
+            The documents returned by ``UnstructuredEmailLoader.load``.
+
+        Raises:
+            Exception: Any loader failure, with ``self.file_path`` prefixed
+                to the message. The original exception type is kept when it
+                can be built from a single string; otherwise ``RuntimeError``
+                is used. The original error is chained as ``__cause__``.
+        """
+        try:
+            try:
+                doc = UnstructuredEmailLoader.load(self)
+            except ValueError as e:
+                if 'text/html content not found in email' in str(e):
+                    # Try plain text
+                    self.unstructured_kwargs["content_source"] = "text/plain"
+                    doc = UnstructuredEmailLoader.load(self)
+                else:
+                    raise
+        except Exception as e:
+            # Add file_path to exception message
+            message = f"{self.file_path}: {e}"
+            try:
+                wrapped = type(e)(message)
+            except Exception:
+                # Some exception types (e.g. UnicodeDecodeError) cannot be
+                # built from one string; don't let that mask the real error.
+                wrapped = RuntimeError(message)
+            raise wrapped from e
+
+        return doc
+
+
 # Map file extensions to document loaders and their arguments
 LOADER_MAPPING = {
     ".csv": (CSVLoader, {}),
@@ -47,24 +86,6 @@
 load_dotenv()
 
 
-class MyElmLoader(UnstructuredEmailLoader):
-    """Wrapper to fallback to text/plain when default does not work"""
-
-    def load(self) -> List[Document]:
-        """Wrapper adding fallback for elm without html"""
-        try:
-            doc = UnstructuredEmailLoader.load()
-        except ValueError as e:
-            if 'text/html content not found in email' in str(e):
-                # Try plain text
-                self.unstructured_kwargs["content_source"]="text/plain"
-                doc = UnstructuredEmailLoader.load()
-            else:
-                raise
-
-        return doc
-
-
 def load_single_document(file_path: str) -> Document:
     ext = "." + file_path.rsplit(".", 1)[-1]
     if ext in LOADER_MAPPING: