Skip to content

Commit

Permalink
Revert "PDF and DOCX support in Write File - Feature Improvement, close #548 (#1125)"
Browse files Browse the repository at this point in the history

This reverts commit 26f6a1d.
  • Loading branch information
luciferlinx101 authored Sep 7, 2023
1 parent 589341e commit d13c57a
Show file tree
Hide file tree
Showing 10 changed files with 30 additions and 266 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM python:3.10-slim-bullseye AS compile-image
WORKDIR /app

RUN apt-get update && \
apt-get install --no-install-recommends -y wget libpq-dev gcc g++ python3-dev wkhtmltopdf && \
apt-get install --no-install-recommends -y wget libpq-dev gcc g++ python3-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

Expand All @@ -24,7 +24,7 @@ FROM python:3.10-slim-bullseye AS build-image
WORKDIR /app

RUN apt-get update && \
apt-get install --no-install-recommends -y libpq-dev wkhtmltopdf && \
apt-get install --no-install-recommends -y libpq-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

Expand Down
2 changes: 0 additions & 2 deletions DockerfileCelery
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ FROM python:3.9
WORKDIR /app

#RUN apt-get update && apt-get install --no-install-recommends -y git wget libpq-dev gcc python3-dev && pip install psycopg2
RUN apt-get update && apt-get install -y wkhtmltopdf

RUN pip install --upgrade pip

COPY requirements.txt .
Expand Down
3 changes: 0 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,5 @@ html2text==2020.1.16
duckduckgo-search==3.8.3
google-generativeai==0.1.0
unstructured==0.8.1
beautifulsoup4==4.12.2
pdfkit==1.0.0
htmldocx==0.0.6
ai21==1.2.6
typing-extensions==4.5.0
Empty file removed superagi/exceptions/__init__.py
Empty file.
10 changes: 0 additions & 10 deletions superagi/exceptions/file_exceptions.py

This file was deleted.

104 changes: 18 additions & 86 deletions superagi/resource_manager/file_manager.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,18 @@
import csv
from sqlalchemy.orm import Session
import os

from superagi.config.config import get_config
import os
from superagi.helper.resource_helper import ResourceHelper
from superagi.helper.s3_helper import S3Helper
from superagi.lib.logger import logger
from superagi.models.agent import Agent
from superagi.models.agent_execution import AgentExecution
from superagi.types.storage_types import StorageType
from superagi.exceptions.file_exceptions import UnsupportedFileTypeError, FileNotCreatedError

import pdfkit
from htmldocx import HtmlToDocx

class FileManager:
def __init__(self, session: Session, agent_id: int = None, agent_execution_id: int = None):
self.session = session
self.agent_id = agent_id
self.agent_execution_id = agent_execution_id

def write_binary_file(self, file_name: str, data):
if self.agent_id is not None:
final_path = ResourceHelper.get_agent_write_resource_path(file_name,
Expand All @@ -39,7 +32,6 @@ def write_binary_file(self, file_name: str, data):
return f"Binary {file_name} saved successfully"
except Exception as err:
return f"Error write_binary_file: {err}"

def write_to_s3(self, file_name, final_path):
with open(final_path, 'rb') as img:
resource = ResourceHelper.make_written_file_resource(file_name=file_name,
Expand All @@ -63,16 +55,25 @@ def write_file(self, file_name: str, content):
self.agent_execution_id))
else:
final_path = ResourceHelper.get_resource_path(file_name)

try:
self.save_file_by_type(file_name=file_name, file_path=final_path, content=content)
with open(final_path, mode="w") as file:
file.write(content)
file.close()
self.write_to_s3(file_name, final_path)
logger.info(f"{file_name} - File written successfully")
return f"{file_name} - File written successfully"
except Exception as err:
return f"Error write_file: {err}"

logger.info(f"{file_name} - File written successfully")
return f"{file_name} - File written successfully"

def write_csv_file(self, file_name: str, final_path: str, csv_data) -> str:
def write_csv_file(self, file_name: str, csv_data):
if self.agent_id is not None:
final_path = ResourceHelper.get_agent_write_resource_path(file_name,
agent=Agent.get_agent_from_id(self.session,
self.agent_id),
agent_execution=AgentExecution
.get_agent_execution_from_id(self.session,
self.agent_execution_id))
else:
final_path = ResourceHelper.get_resource_path(file_name)
try:
with open(final_path, mode="w", newline="") as file:
writer = csv.writer(file, lineterminator="\n")
Expand All @@ -81,63 +82,15 @@ def write_csv_file(self, file_name: str, final_path: str, csv_data) -> str:
logger.info(f"{file_name} - File written successfully")
return f"{file_name} - File written successfully"
except Exception as err:
raise FileNotCreatedError(file_name=file_name) from err

def write_pdf_file(self, file_name: str ,file_path: str, content):
# Saving the HTML file
html_file_path = f"{file_path[:-4]}.html"
self.write_txt_file(file_name=html_file_path.split('/')[-1], file_path=html_file_path, content=content)

# Convert HTML file to a PDF file
try:
options = {
'quiet': '',
'page-size': 'Letter',
'margin-top': '0.75in',
'margin-right': '0.75in',
'margin-bottom': '0.75in',
'margin-left': '0.75in',
'enable-local-file-access': ''
}
config = pdfkit.configuration(wkhtmltopdf = "/usr/bin/wkhtmltopdf")
pdfkit.from_file(html_file_path, file_path, options = options, configuration = config)
self.write_to_s3(file_name, file_path)
return file_path
return f"Error write_csv_file: {err}"

except Exception as err:
raise FileNotCreatedError(file_name=file_name) from err

def write_docx_file(self, file_name: str ,file_path: str, content):
# Saving the HTML file
html_file_path = f"{file_path[:-4]}.html"
self.write_txt_file(file_name=html_file_path.split('/')[-1], file_path=html_file_path, content=content)

# Convert HTML file to a DOCx file
try:
new_parser = HtmlToDocx()
new_parser.parse_html_file(html_file_path, file_path)
self.write_to_s3(file_name, file_path)
return file_path
except Exception as err:
raise FileNotCreatedError(file_name=file_name) from err

def write_txt_file(self, file_name: str ,file_path: str, content) -> str:
try:
with open(file_path, mode="w") as file:
file.write(content)
file.close()
self.write_to_s3(file_name, file_path)
return file_path
except Exception as err:
raise FileNotCreatedError(file_name=file_name) from err

def get_agent_resource_path(self, file_name: str):
return ResourceHelper.get_agent_write_resource_path(file_name, agent=Agent.get_agent_from_id(self.session,
self.agent_id),
agent_execution=AgentExecution
.get_agent_execution_from_id(self.session,
self.agent_execution_id))

def read_file(self, file_name: str):
if self.agent_id is not None:
final_path = self.get_agent_resource_path(file_name)
Expand All @@ -151,7 +104,6 @@ def read_file(self, file_name: str):
return content
except Exception as err:
return f"Error while reading file {file_name}: {err}"

def get_files(self):
"""
Gets all file names generated by the CodingTool.
Expand All @@ -170,23 +122,3 @@ def get_files(self):
logger.error(f"Error while accessing files in {final_path}: {err}")
files = []
return files

def save_file_by_type(self, file_name: str, file_path: str, content):

# Extract the file type from the file_name
file_type = file_name.split('.')[-1].lower()

# Dictionary to map file types to corresponding functions
file_type_handlers = {
'txt': self.write_txt_file,
'pdf': self.write_pdf_file,
'docx': self.write_docx_file,
'doc': self.write_docx_file,
'csv': self.write_csv_file,
'html': self.write_txt_file
# NOTE: Add more file types and corresponding functions as needed, These functions should be defined
}

if file_type not in file_type_handlers:
raise UnsupportedFileTypeError(file_name=file_name, supported_types=list(file_type_handlers))

4 changes: 0 additions & 4 deletions superagi/tools/file/prompts/add_images_to_html.txt

This file was deleted.

7 changes: 0 additions & 7 deletions superagi/tools/file/prompts/content_to_html_prompt.txt

This file was deleted.

130 changes: 9 additions & 121 deletions superagi/tools/file/write_file.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,21 @@
from typing import Type, Optional
import base64
import os

from pydantic import BaseModel, Field

from superagi.helper.prompt_reader import PromptReader
from superagi.helper.resource_helper import ResourceHelper
# from superagi.helper.s3_helper import upload_to_s3
from superagi.resource_manager.file_manager import FileManager
from superagi.tools.base_tool import BaseTool
from superagi.tools.tool_response_query_manager import ToolResponseQueryManager
from superagi.llms.base_llm import BaseLlm
from superagi.models.agent import Agent
from superagi.models.agent_execution import AgentExecution


# from superagi.helper.s3_helper import upload_to_s3


class WriteFileInput(BaseModel):
"""Input for CopyFileTool."""
file_name: str = Field(..., description="Name of the file to write. Only include the file name. Don't include path.")
content: str = Field(..., description="File content to write")


class WriteFileTool(BaseTool):
"""
Write File tool
Expand All @@ -30,13 +27,12 @@ class WriteFileTool(BaseTool):
args_schema : The args schema.
resource_manager: File resource manager.
"""
llm: Optional[BaseLlm] = None
name: str = "Write File"
args_schema: Type[BaseModel] = WriteFileInput
description: str = "Writes content in a file. The content can carry text and images."
description: str = "Writes text to a file"
agent_id: int = None
resource_manager: Optional[FileManager] = None
agent_id:int =None
agent_execution_id:int =None

class Config:
arbitrary_types_allowed = True

Expand All @@ -51,112 +47,4 @@ def _execute(self, file_name: str, content: str):
Returns:
success message if message is file written successfully or failure message if writing file fails.
"""

attached_files = self._get_attached_files()
file_type = file_name.split('.')[-1].lower()

if file_type not in ['pdf', 'docx', 'doc']:
return self.resource_manager.write_file(file_name, content)

html_code_content = self._convert_content_into_html(content=content, attached_files = attached_files,formated_for=file_type)

return self.resource_manager.write_file(file_name, html_code_content)

def _convert_content_into_html(self, content: str, attached_files: list, formated_for: str) -> str:
"""
Converts the content into an HTML file
Args:
content (str): Content to be beautified and formatted
formated_for (str): HTML content to be specifically formatted for a specific document type
Returns:
HTML Code (str): HTML code of the formated content
"""
prompt = PromptReader.read_tools_prompt(__file__, "content_to_html_prompt.txt")
prompt = prompt.replace("{content}", content)

if image_file_paths := self._get_file_path_of_images(attached_files=attached_files):
embedding_image_prompt = PromptReader.read_tools_prompt(__file__, "add_images_to_html.txt")
for idx, image_path in enumerate(image_file_paths):
embedding_image_prompt += f"\n{idx+1}. {image_path}"
prompt = prompt.replace("{embedding_image}", embedding_image_prompt)
else:
prompt = prompt.replace("{embedding_image}", "")

messages = [{"role": "system", "content": prompt}]
result = self.llm.chat_completion(messages, max_tokens=self.max_token_limit)

if formated_for == 'pdf':
return self._html_formatting_for_pdf(content=result["content"], image_list=image_file_paths)
return result['content']

def _get_file_path_of_images(self, attached_files: list):
"""
Filters Images from the attached files list and finds out the corresponding paths.
Args:
attached_files: List of names of files generated
Returns:
Full paths of the image files
"""
image_extensions, image_paths = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'],[]
for file_name in attached_files:
path = ResourceHelper().get_agent_read_resource_path(
file_name,
agent=Agent.get_agent_from_id(self.toolkit_config.session, self.agent_id),
agent_execution=AgentExecution.get_agent_execution_from_id(
self.toolkit_config.session, self.agent_execution_id
),
)
if not os.path.exists(path):
continue
_, file_extension = os.path.splitext(path)
# Check if the file extension is in the list of image extensions
if file_extension.lower() in image_extensions:
image_paths.append(path)
return image_paths


def _image_to_base64(self, image_path):
with open(image_path, "rb") as image_file:
encoded_image = base64.b64encode(image_file.read()).decode("utf-8")
return encoded_image

def _html_formatting_for_pdf(self, content: str, image_list: list) -> str:
"""
Converts image paths into base64 inputs in an HTML file
Args:
content (str): HTML code
image_list (list): List of images to be converted and replaced
Returns:
HTML Code (str): Formatted HTML code
"""

for image_path in image_list:
content = content.replace(f"{image_path}", f"data:image/png;base64,{self._image_to_base64(image_path=image_path)}")
return content

def _get_attached_files(self):
output_directory = ResourceHelper.get_root_output_dir()
if "{agent_id}" not in output_directory:
return []
output_directory = ResourceHelper.get_formatted_agent_level_path(agent=Agent
.get_agent_from_id(session=self
.toolkit_config.session,
agent_id=self.agent_id),
path=output_directory)
agent_execution=AgentExecution.get_agent_execution_from_id(session=self.toolkit_config.session, agent_execution_id=self.agent_execution_id)
if agent_execution is not None and "{agent_execution_id}" in output_directory:
output_directory = ResourceHelper.get_formatted_agent_execution_level_path(agent_execution=agent_execution, path=output_directory)

return self._list_files(output_directory)

def _list_files(self, directory):
found_files = []
for root, dirs, files in os.walk(directory):
found_files.extend(
file
for file in files
if not file.startswith(".") and "__pycache__" not in root
)
return found_files
return self.resource_manager.write_file(file_name, content)
Loading

0 comments on commit d13c57a

Please sign in to comment.