Revert "PDF and DOCX support in Write File - Feature Improvement, close #548 (#1125)"

This reverts commit 26f6a1d.
TransformerOptimus · Sep 7, 2023 · d13c57a
1 parent 589341e
commit d13c57a
Show file tree

Hide file tree

Showing 10 changed files with 30 additions and 266 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -3,7 +3,7 @@ FROM python:3.10-slim-bullseye AS compile-image
 WORKDIR /app
 
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y wget libpq-dev gcc g++ python3-dev wkhtmltopdf && \
+    apt-get install --no-install-recommends -y wget libpq-dev gcc g++ python3-dev && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
@@ -24,7 +24,7 @@ FROM python:3.10-slim-bullseye AS build-image
 WORKDIR /app
 
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y libpq-dev wkhtmltopdf && \
+    apt-get install --no-install-recommends -y libpq-dev && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 

diff --git a/DockerfileCelery b/DockerfileCelery
@@ -3,8 +3,6 @@ FROM python:3.9
 WORKDIR /app
 
 #RUN apt-get update && apt-get install --no-install-recommends -y git wget libpq-dev gcc python3-dev && pip install psycopg2
-RUN apt-get update && apt-get install -y wkhtmltopdf
-
 RUN pip install --upgrade pip
 
 COPY requirements.txt .

diff --git a/requirements.txt b/requirements.txt
@@ -156,8 +156,5 @@ html2text==2020.1.16
 duckduckgo-search==3.8.3 
 google-generativeai==0.1.0
 unstructured==0.8.1
-beautifulsoup4==4.12.2
-pdfkit==1.0.0
-htmldocx==0.0.6
 ai21==1.2.6
 typing-extensions==4.5.0
diff --git a/superagi/exceptions/__init__.py b/superagi/exceptions/__init__.py
diff --git a/superagi/exceptions/file_exceptions.py b/superagi/exceptions/file_exceptions.py
diff --git a/superagi/resource_manager/file_manager.py b/superagi/resource_manager/file_manager.py
@@ -1,25 +1,18 @@
 import csv
 from sqlalchemy.orm import Session
-import os
-
 from superagi.config.config import get_config
+import os
 from superagi.helper.resource_helper import ResourceHelper
 from superagi.helper.s3_helper import S3Helper
 from superagi.lib.logger import logger
 from superagi.models.agent import Agent
 from superagi.models.agent_execution import AgentExecution
 from superagi.types.storage_types import StorageType
-from superagi.exceptions.file_exceptions import UnsupportedFileTypeError, FileNotCreatedError
-
-import pdfkit
-from htmldocx import HtmlToDocx
-
 class FileManager:
     def __init__(self, session: Session, agent_id: int = None, agent_execution_id: int = None):
         self.session = session
         self.agent_id = agent_id
         self.agent_execution_id = agent_execution_id
-
     def write_binary_file(self, file_name: str, data):
         if self.agent_id is not None:
             final_path = ResourceHelper.get_agent_write_resource_path(file_name,
@@ -39,7 +32,6 @@ def write_binary_file(self, file_name: str, data):
             return f"Binary {file_name} saved successfully"
         except Exception as err:
             return f"Error write_binary_file: {err}"
-
     def write_to_s3(self, file_name, final_path):
         with open(final_path, 'rb') as img:
             resource = ResourceHelper.make_written_file_resource(file_name=file_name,
@@ -63,16 +55,25 @@ def write_file(self, file_name: str, content):
                                                                                                    self.agent_execution_id))
         else:
             final_path = ResourceHelper.get_resource_path(file_name)
-
         try:
-            self.save_file_by_type(file_name=file_name, file_path=final_path, content=content)
+            with open(final_path, mode="w") as file:
+                file.write(content)
+                file.close()
+            self.write_to_s3(file_name, final_path)
+            logger.info(f"{file_name} - File written successfully")
+            return f"{file_name} - File written successfully"
         except Exception as err:
             return f"Error write_file: {err}"
-
-        logger.info(f"{file_name} - File written successfully")
-        return f"{file_name} - File written successfully"
-
-    def write_csv_file(self, file_name: str, final_path: str, csv_data) -> str:
+    def write_csv_file(self, file_name: str, csv_data):
+        if self.agent_id is not None:
+            final_path = ResourceHelper.get_agent_write_resource_path(file_name,
+                                                                      agent=Agent.get_agent_from_id(self.session,
+                                                                                                    self.agent_id),
+                                                                      agent_execution=AgentExecution
+                                                                      .get_agent_execution_from_id(self.session,
+                                                                                                   self.agent_execution_id))
+        else:
+            final_path = ResourceHelper.get_resource_path(file_name)
         try:
             with open(final_path, mode="w", newline="") as file:
                 writer = csv.writer(file, lineterminator="\n")
@@ -81,63 +82,15 @@ def write_csv_file(self, file_name: str, final_path: str, csv_data) -> str:
             logger.info(f"{file_name} - File written successfully")
             return f"{file_name} - File written successfully"
         except Exception as err:
-            raise FileNotCreatedError(file_name=file_name) from err
-
-    def write_pdf_file(self, file_name: str ,file_path: str, content):
-        # Saving the HTML file
-        html_file_path = f"{file_path[:-4]}.html"
-        self.write_txt_file(file_name=html_file_path.split('/')[-1], file_path=html_file_path, content=content)
-
-        # Convert HTML file to a PDF file
-        try:
-            options = {
-                'quiet': '',
-                'page-size': 'Letter',
-                'margin-top': '0.75in',
-                'margin-right': '0.75in',
-                'margin-bottom': '0.75in',
-                'margin-left': '0.75in',
-                'enable-local-file-access': ''
-            }
-            config = pdfkit.configuration(wkhtmltopdf = "/usr/bin/wkhtmltopdf")
-            pdfkit.from_file(html_file_path, file_path, options = options, configuration = config)
-            self.write_to_s3(file_name, file_path)
-            return file_path
+            return f"Error write_csv_file: {err}"
 
-        except Exception as err:
-            raise FileNotCreatedError(file_name=file_name) from err
-
-    def write_docx_file(self, file_name: str ,file_path: str, content):
-        # Saving the HTML file
-        html_file_path = f"{file_path[:-4]}.html"
-        self.write_txt_file(file_name=html_file_path.split('/')[-1], file_path=html_file_path, content=content)
 
-        # Convert HTML file to a DOCx file
-        try:
-            new_parser = HtmlToDocx()
-            new_parser.parse_html_file(html_file_path, file_path)
-            self.write_to_s3(file_name, file_path)
-            return file_path
-        except Exception as err:
-            raise FileNotCreatedError(file_name=file_name) from err
-
-    def write_txt_file(self, file_name: str ,file_path: str, content) -> str:
-        try:
-            with open(file_path, mode="w") as file:
-                file.write(content)
-                file.close()
-            self.write_to_s3(file_name, file_path)
-            return file_path
-        except Exception as err:
-            raise FileNotCreatedError(file_name=file_name) from err
-
     def get_agent_resource_path(self, file_name: str):
         return ResourceHelper.get_agent_write_resource_path(file_name, agent=Agent.get_agent_from_id(self.session,
                                                                                                      self.agent_id),
                                                             agent_execution=AgentExecution
                                                             .get_agent_execution_from_id(self.session,
                                                                                          self.agent_execution_id))
-
     def read_file(self, file_name: str):
         if self.agent_id is not None:
             final_path = self.get_agent_resource_path(file_name)
@@ -151,7 +104,6 @@ def read_file(self, file_name: str):
             return content
         except Exception as err:
             return f"Error while reading file {file_name}: {err}"
-
     def get_files(self):
         """
         Gets all file names generated by the CodingTool.
@@ -170,23 +122,3 @@ def get_files(self):
             logger.error(f"Error while accessing files in {final_path}: {err}")
             files = []
         return files
-
-    def save_file_by_type(self, file_name: str, file_path: str, content):
-
-        # Extract the file type from the file_name
-        file_type = file_name.split('.')[-1].lower()
-
-        # Dictionary to map file types to corresponding functions
-        file_type_handlers = {
-            'txt': self.write_txt_file,
-            'pdf': self.write_pdf_file,
-            'docx': self.write_docx_file, 
-            'doc': self.write_docx_file,
-            'csv': self.write_csv_file,
-            'html': self.write_txt_file
-            # NOTE: Add more file types and corresponding functions as needed, These functions should be defined 
-        }
-
-        if file_type not in file_type_handlers:
-            raise UnsupportedFileTypeError(file_name=file_name, supported_types=list(file_type_handlers))
-
diff --git a/superagi/tools/file/prompts/add_images_to_html.txt b/superagi/tools/file/prompts/add_images_to_html.txt
diff --git a/superagi/tools/file/prompts/content_to_html_prompt.txt b/superagi/tools/file/prompts/content_to_html_prompt.txt
diff --git a/superagi/tools/file/write_file.py b/superagi/tools/file/write_file.py
@@ -1,24 +1,21 @@
 from typing import Type, Optional
-import base64
-import os
 
 from pydantic import BaseModel, Field
 
-from superagi.helper.prompt_reader import PromptReader
-from superagi.helper.resource_helper import ResourceHelper
+# from superagi.helper.s3_helper import upload_to_s3
 from superagi.resource_manager.file_manager import FileManager
 from superagi.tools.base_tool import BaseTool
-from superagi.tools.tool_response_query_manager import ToolResponseQueryManager
-from superagi.llms.base_llm import BaseLlm
-from superagi.models.agent import Agent
-from superagi.models.agent_execution import AgentExecution
+
+
+# from superagi.helper.s3_helper import upload_to_s3
 
 
 class WriteFileInput(BaseModel):
     """Input for CopyFileTool."""
     file_name: str = Field(..., description="Name of the file to write. Only include the file name. Don't include path.")
     content: str = Field(..., description="File content to write")
 
+
 class WriteFileTool(BaseTool):
     """
     Write File tool
@@ -30,13 +27,12 @@ class WriteFileTool(BaseTool):
         args_schema : The args schema.
         resource_manager: File resource manager.
     """
-    llm: Optional[BaseLlm] = None
     name: str = "Write File"
     args_schema: Type[BaseModel] = WriteFileInput
-    description: str = "Writes content in a file. The content can carry text and images."
+    description: str = "Writes text to a file"
+    agent_id: int = None
     resource_manager: Optional[FileManager] = None
-    agent_id:int =None
-    agent_execution_id:int =None
+
     class Config:
         arbitrary_types_allowed = True
 
@@ -51,112 +47,4 @@ def _execute(self, file_name: str, content: str):
         Returns:
             success message if message is file written successfully or failure message if writing file fails.
         """
-
-        attached_files = self._get_attached_files()
-        file_type = file_name.split('.')[-1].lower()
-
-        if file_type not in ['pdf', 'docx', 'doc']:
-            return self.resource_manager.write_file(file_name, content)
-
-        html_code_content = self._convert_content_into_html(content=content, attached_files = attached_files,formated_for=file_type)
-
-        return self.resource_manager.write_file(file_name, html_code_content)
-
-    def _convert_content_into_html(self, content: str, attached_files: list, formated_for: str) -> str:
-        """
-        Converts the content into an HTML file
-        Args:
-            content (str): Content to be beautified and formatted
-            formated_for (str): HTML content to be specifically formatted for a specific document type
-        Returns:
-            HTML Code (str): HTML code of the formated content 
-        """
-        prompt = PromptReader.read_tools_prompt(__file__, "content_to_html_prompt.txt")
-        prompt = prompt.replace("{content}", content)
-
-        if image_file_paths := self._get_file_path_of_images(attached_files=attached_files):
-            embedding_image_prompt = PromptReader.read_tools_prompt(__file__, "add_images_to_html.txt")
-            for idx, image_path in enumerate(image_file_paths):
-                embedding_image_prompt += f"\n{idx+1}. {image_path}"
-            prompt = prompt.replace("{embedding_image}", embedding_image_prompt)
-        else:
-            prompt = prompt.replace("{embedding_image}", "")
-
-        messages = [{"role": "system", "content": prompt}]
-        result = self.llm.chat_completion(messages, max_tokens=self.max_token_limit)
-
-        if formated_for == 'pdf':
-            return self._html_formatting_for_pdf(content=result["content"], image_list=image_file_paths)
-        return result['content']
-
-    def _get_file_path_of_images(self, attached_files: list):
-        """
-        Filters Images from the attached files list and finds out the corresponding paths. 
-
-        Args:
-            attached_files: List of names of files generated
-
-        Returns:
-            Full paths of the image files
-        """
-        image_extensions, image_paths = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'],[]
-        for file_name in attached_files:
-            path = ResourceHelper().get_agent_read_resource_path(
-                    file_name,
-                    agent=Agent.get_agent_from_id(self.toolkit_config.session, self.agent_id),
-                    agent_execution=AgentExecution.get_agent_execution_from_id(
-                        self.toolkit_config.session, self.agent_execution_id
-                    ),
-                )
-            if not os.path.exists(path):
-                continue
-            _, file_extension = os.path.splitext(path)
-            # Check if the file extension is in the list of image extensions
-            if file_extension.lower() in image_extensions:
-                image_paths.append(path)
-        return image_paths
-
-
-    def _image_to_base64(self, image_path):
-        with open(image_path, "rb") as image_file:
-            encoded_image = base64.b64encode(image_file.read()).decode("utf-8")
-        return encoded_image
-
-    def _html_formatting_for_pdf(self, content: str, image_list: list) -> str:
-        """
-        Converts image paths into base64 inputs in an HTML file
-        Args:
-            content (str): HTML code
-            image_list (list): List of images to be converted and replaced
-        Returns:
-            HTML Code (str): Formatted HTML code 
-        """
-
-        for image_path in image_list:
-            content = content.replace(f"{image_path}", f"data:image/png;base64,{self._image_to_base64(image_path=image_path)}")
-        return content
-
-    def _get_attached_files(self):
-        output_directory = ResourceHelper.get_root_output_dir()
-        if "{agent_id}" not in output_directory:
-            return []
-        output_directory = ResourceHelper.get_formatted_agent_level_path(agent=Agent
-                                                                        .get_agent_from_id(session=self
-                                                                                           .toolkit_config.session,
-                                                                                           agent_id=self.agent_id),
-                                                                        path=output_directory)
-        agent_execution=AgentExecution.get_agent_execution_from_id(session=self.toolkit_config.session, agent_execution_id=self.agent_execution_id)
-        if agent_execution is not None and "{agent_execution_id}" in output_directory:
-            output_directory = ResourceHelper.get_formatted_agent_execution_level_path(agent_execution=agent_execution, path=output_directory)
-
-        return self._list_files(output_directory)
-
-    def _list_files(self, directory):
-        found_files = []
-        for root, dirs, files in os.walk(directory):
-            found_files.extend(
-                file
-                for file in files
-                if not file.startswith(".") and "__pycache__" not in root
-            )
-        return found_files
+        return self.resource_manager.write_file(file_name, content)