From 5d8772d3e89a7d01417aa7904451bf3f6032af96 Mon Sep 17 00:00:00 2001
From: Rounak Bhatia
Date: Thu, 14 Sep 2023 14:27:17 +0530
Subject: [PATCH] read_tool_fix

---
 superagi/helper/validate_csv.py  | 53 ++++++++++++++++++++++++++++++++
 superagi/tools/file/read_file.py |  6 ++++
 2 files changed, 59 insertions(+)
 create mode 100644 superagi/helper/validate_csv.py

diff --git a/superagi/helper/validate_csv.py b/superagi/helper/validate_csv.py
new file mode 100644
index 000000000..5e1c6fd46
--- /dev/null
+++ b/superagi/helper/validate_csv.py
@@ -0,0 +1,53 @@
+import codecs
+import csv
+import pandas as pd
+import chardet
+from superagi.lib.logger import logger
+
+
+def correct_csv_encoding(file_path):
+    """Re-encode the CSV at ``file_path`` to UTF-8 in place when needed.
+
+    The encoding is guessed with ``chardet``. Files detected as UTF-8 or
+    ASCII (a subset of UTF-8) are left untouched. Any other file is parsed
+    as a ';'-delimited CSV and written back as a ','-delimited UTF-8 CSV.
+    If the encoding cannot be determined or the file cannot be decoded,
+    the problem is logged and the file is left as it was.
+
+    Args:
+        file_path: Path of the CSV file to check; it may be overwritten.
+    """
+    with open(file_path, 'rb') as f:
+        detected = chardet.detect(f.read())['encoding']
+
+    try:
+        # Normalise the reported name (case and aliases vary) so that it can
+        # be compared reliably; a falsy value means nothing was detected.
+        codec_name = codecs.lookup(detected).name if detected else None
+    except LookupError:
+        codec_name = None
+    if codec_name is None:
+        logger.error(f"Could not determine the encoding of the file: {detected}")
+        return
+
+    if codec_name in ('utf-8', 'ascii'):
+        logger.info("File is already in utf-8 encoding.")
+        return
+
+    try:
+        # newline='' is what the csv module expects from its input file.
+        with open(file_path, 'r', encoding=detected, newline='') as f:
+            # NOTE(review): the ';' delimiter is assumed for every re-encoded
+            # file; confirm that comma-delimited input never reaches here.
+            rows = list(csv.reader(f, delimiter=';', quotechar='"'))
+    except (UnicodeDecodeError, csv.Error) as e:
+        # The guess was wrong or the data is malformed: keep the original.
+        logger.error(f"An error occurred while processing the file: {e}")
+        return
+
+    # header=False: otherwise pandas prepends a "0,1,2,..." row made from
+    # the DataFrame's positional column labels.
+    pd.DataFrame(rows).to_csv(
+        file_path, encoding='utf-8', index=False, header=False
+    )
+    logger.info("File is converted to utf-8 encoding.")
\ No newline at end of file
diff --git a/superagi/tools/file/read_file.py b/superagi/tools/file/read_file.py
index 7c4177438..c984a7637 100644
--- a/superagi/tools/file/read_file.py
+++ b/superagi/tools/file/read_file.py
@@ -7,6 +7,7 @@ from pydantic import BaseModel, Field
 from ebooklib import epub
+from superagi.helper.validate_csv import correct_csv_encoding
 from superagi.helper.resource_helper import ResourceHelper
 from superagi.helper.s3_helper import S3Helper
@@ -17,6 +18,7 @@ from superagi.types.storage_types import StorageType
 from superagi.config.config import get_config
 from unstructured.partition.auto import partition
+from superagi.lib.logger import logger
 class ReadFileSchema(BaseModel):
"""Input for CopyFileTool.""" @@ -89,7 +91,11 @@ def _execute(self, file_name: str): content = "\n".join(content) else: + logger.info(final_path) + if final_path.endswith('.csv'): + correct_csv_encoding(final_path) elements = partition(final_path) + logger.info(elements) content = "\n\n".join([str(el) for el in elements]) if temporary_file_path is not None: