Commit
Merge pull request #917 from arc53/multiple-uploads
Multiple file upload
dartpain authored Apr 9, 2024
2 parents 968a116 + 7a02df5 commit 7d2b8cb
Showing 3 changed files with 67 additions and 27 deletions.
56 changes: 35 additions & 21 deletions application/api/user/routes.py
@@ -1,5 +1,6 @@
 import os
 import uuid
+import shutil
 from flask import Blueprint, request, jsonify
 from urllib.parse import urlparse
 import requests
@@ -136,30 +137,43 @@ def upload_file():
         return {"status": "no name"}
     job_name = secure_filename(request.form["name"])
     # check if the post request has the file part
-    if "file" not in request.files:
-        print("No file part")
-        return {"status": "no file"}
-    file = request.files["file"]
-    if file.filename == "":
+    files = request.files.getlist("file")
+
+    if not files or all(file.filename == '' for file in files):
         return {"status": "no file name"}
-    if file:
-        filename = secure_filename(file.filename)
-        # save dir
-        save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name)
-        # create dir if not exists
-        if not os.path.exists(save_dir):
-            os.makedirs(save_dir)
-
-        file.save(os.path.join(save_dir, filename))
-        task = ingest.delay(settings.UPLOAD_FOLDER, [".rst", ".md", ".pdf", ".txt", ".docx",
-                                                     ".csv", ".epub", ".html", ".mdx"],
-                            job_name, filename, user)
-        # task id
-        task_id = task.id
-        return {"status": "ok", "task_id": task_id}
+
+    # Directory where files will be saved
+    save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name)
+    os.makedirs(save_dir, exist_ok=True)
+
+    if len(files) > 1:
+        # Multiple files; prepare them for zip
+        temp_dir = os.path.join(save_dir, "temp")
+        os.makedirs(temp_dir, exist_ok=True)
+
+        for file in files:
+            filename = secure_filename(file.filename)
+            file.save(os.path.join(temp_dir, filename))
+
+        # Use shutil.make_archive to zip the temp directory
+        zip_path = shutil.make_archive(base_name=os.path.join(save_dir, job_name), format='zip', root_dir=temp_dir)
+        final_filename = os.path.basename(zip_path)
+
+        # Clean up the temporary directory after zipping
+        shutil.rmtree(temp_dir)
     else:
-        return {"status": "error"}
+        # Single file
+        file = files[0]
+        final_filename = secure_filename(file.filename)
+        file_path = os.path.join(save_dir, final_filename)
+        file.save(file_path)
+
+    # Call ingest with the single file or zipped file
+    task = ingest.delay(settings.UPLOAD_FOLDER, [".rst", ".md", ".pdf", ".txt", ".docx",
+                                                 ".csv", ".epub", ".html", ".mdx"],
+                        job_name, final_filename, user)
+
+    return {"status": "ok", "task_id": task.id}
 
 
 @user.route("/api/remote", methods=["POST"])
 def upload_remote():
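With this change, /api/upload accepts several files posted under the same "file" form field; two or more files are zipped server-side before being handed to ingest, while a single file is saved as before. A minimal client sketch, assuming a locally running backend (the base URL and the example file names are placeholders; the "user", "name", and "file" fields come from the route above):

# Sketch only: post two files to the upload route changed above.
import requests

BASE_URL = "http://localhost:7091"  # assumed host/port, adjust to your deployment

files = [
    ("file", ("notes.md", open("notes.md", "rb"), "text/markdown")),
    ("file", ("paper.pdf", open("paper.pdf", "rb"), "application/pdf")),
]
data = {"user": "local", "name": "my-docs"}  # form fields read by upload_file()

resp = requests.post(f"{BASE_URL}/api/upload", files=files, data=data)
print(resp.json())  # expected shape: {"status": "ok", "task_id": "..."}

Because both entries share the field name "file", Flask's request.files.getlist("file") on the server receives them as a list, which is what triggers the new zip-and-ingest path.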
36 changes: 31 additions & 5 deletions application/worker.py
@@ -36,6 +36,32 @@ def generate_random_string(length):
     os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 )
 
+def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
+    """
+    Recursively extract zip files with a limit on recursion depth.
+    Args:
+        zip_path (str): Path to the zip file to be extracted.
+        extract_to (str): Destination path for extracted files.
+        current_depth (int): Current depth of recursion.
+        max_depth (int): Maximum allowed depth of recursion to prevent infinite loops.
+    """
+    if current_depth > max_depth:
+        print(f"Reached maximum recursion depth of {max_depth}")
+        return
+
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(extract_to)
+    os.remove(zip_path)  # Remove the zip file after extracting
+
+    # Check for nested zip files and extract them
+    for root, dirs, files in os.walk(extract_to):
+        for file in files:
+            if file.endswith(".zip"):
+                # If a nested zip file is found, extract it recursively
+                file_path = os.path.join(root, file)
+                extract_zip_recursive(file_path, root, current_depth + 1, max_depth)
+
+
 # Define the main function for ingesting and processing documents.
 def ingest_worker(self, directory, formats, name_job, filename, user):
@@ -66,9 +92,11 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     token_check = True
     min_tokens = 150
     max_tokens = 1250
-    full_path = directory + "/" + user + "/" + name_job
+    recursion_depth = 2
+    full_path = os.path.join(directory, user, name_job)
     import sys
 
+
     print(full_path, file=sys.stderr)
     # check if API_URL env variable is set
     file_data = {"name": name_job, "file": filename, "user": user}
@@ -81,14 +109,12 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
 
     if not os.path.exists(full_path):
         os.makedirs(full_path)
-    with open(full_path + "/" + filename, "wb") as f:
+    with open(os.path.join(full_path, filename), "wb") as f:
         f.write(file)
 
     # check if file is .zip and extract it
     if filename.endswith(".zip"):
-        with zipfile.ZipFile(full_path + "/" + filename, "r") as zip_ref:
-            zip_ref.extractall(full_path)
-        os.remove(full_path + "/" + filename)
+        extract_zip_recursive(os.path.join(full_path, filename), full_path, 0, recursion_depth)
 
     self.update_state(state="PROGRESS", meta={"current": 1})
 
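The new extract_zip_recursive helper in worker.py expands an archive in place, deletes it, then walks the extraction directory for further .zip files and recurses until max_depth is reached; ingest_worker now calls it with a recursion_depth of 2 instead of doing a single flat extractall. A standalone sketch of that behaviour, assuming the helper is importable as application.worker.extract_zip_recursive (the module path may differ in your checkout):

# Sketch: build a zip inside a zip, then let the helper unpack both levels.
import os
import tempfile
import zipfile

from application.worker import extract_zip_recursive  # assumed import path

workdir = tempfile.mkdtemp()

# inner.zip contains a single text file
inner = os.path.join(workdir, "inner.zip")
with zipfile.ZipFile(inner, "w") as zf:
    zf.writestr("doc.txt", "hello")

# outer.zip contains inner.zip
outer = os.path.join(workdir, "outer.zip")
with zipfile.ZipFile(outer, "w") as zf:
    zf.write(inner, arcname="inner.zip")
os.remove(inner)

extract_to = os.path.join(workdir, "out")
os.makedirs(extract_to, exist_ok=True)
extract_zip_recursive(outer, extract_to, current_depth=0, max_depth=2)

# Both archives have been removed and the nested file is on disk.
print(os.path.exists(os.path.join(extract_to, "doc.txt")))  # True

Archives nested deeper than max_depth are left as .zip files rather than extracted, which is the protection against endlessly nested archives mentioned in the docstring.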
2 changes: 1 addition & 1 deletion frontend/src/upload/Upload.tsx
@@ -201,7 +201,7 @@ export default function Upload({
 
   const { getRootProps, getInputProps, isDragActive } = useDropzone({
     onDrop,
-    multiple: false,
+    multiple: true,
     onDragEnter: doNothing,
     onDragOver: doNothing,
     onDragLeave: doNothing,
