From 506fb3ac35d371b7c0c8cd46d97c8e3ac62fe702 Mon Sep 17 00:00:00 2001 From: Anthony Shaw Date: Fri, 20 Oct 2023 13:00:53 +1100 Subject: [PATCH 1/6] Hash the uploaded files locally and skip them if you provision a second time and they haven't changed --- .gitignore | 4 +++- scripts/prepdocs.py | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 02031bcda7..818ff1af9c 100644 --- a/.gitignore +++ b/.gitignore @@ -144,4 +144,6 @@ cython_debug/ # NPM npm-debug.log* node_modules -static/ \ No newline at end of file +static/ + +data/*.md5 \ No newline at end of file diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py index 90c8001551..27668ada62 100644 --- a/scripts/prepdocs.py +++ b/scripts/prepdocs.py @@ -1,6 +1,7 @@ import argparse import base64 import glob +import hashlib import html import io import os @@ -515,6 +516,26 @@ def read_files( read_files(filename + "/*", use_vectors, vectors_batch_support) continue try: + # if filename ends in .md5 skip + if filename.endswith('.md5'): + continue + + # if there is a file called .md5 in this directory, see if its updated + stored_hash = None + with open(filename, 'rb') as file: + existing_hash = hashlib.md5(file.read()).hexdigest() + if os.path.exists(filename + ".md5"): + with open(filename + ".md5", "r", encoding="utf-8") as md5_f: + stored_hash = md5_f.read() + else: + # Write the hash + with open(filename + ".md5", "w", encoding="utf-8") as md5_f: + md5_f.write(existing_hash) + + if stored_hash and stored_hash.strip() == existing_hash.strip(): + print("Skipping {filename}, no changes detected") + continue + if not args.skipblobs: upload_blobs(filename) page_map = get_document_text(filename) From 21aecef7ec3fab856e553b35070928182a6c4830 Mon Sep 17 00:00:00 2001 From: Anthony Shaw Date: Fri, 20 Oct 2023 13:04:23 +1100 Subject: [PATCH 2/6] Overwrite the hash when it changes --- scripts/prepdocs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py index 27668ada62..44040a3d43 100644 --- a/scripts/prepdocs.py +++ b/scripts/prepdocs.py @@ -527,14 +527,14 @@ def read_files( if os.path.exists(filename + ".md5"): with open(filename + ".md5", "r", encoding="utf-8") as md5_f: stored_hash = md5_f.read() - else: - # Write the hash - with open(filename + ".md5", "w", encoding="utf-8") as md5_f: - md5_f.write(existing_hash) if stored_hash and stored_hash.strip() == existing_hash.strip(): print("Skipping {filename}, no changes detected") continue + else: + # Write the hash + with open(filename + ".md5", "w", encoding="utf-8") as md5_f: + md5_f.write(existing_hash) if not args.skipblobs: upload_blobs(filename) From 479d5b73386257f212421236b9dd1c997cf10745 Mon Sep 17 00:00:00 2001 From: Anthony Shaw Date: Fri, 20 Oct 2023 13:10:59 +1100 Subject: [PATCH 3/6] Remove open mode parameter --- scripts/prepdocs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py index 44040a3d43..58a2f25937 100644 --- a/scripts/prepdocs.py +++ b/scripts/prepdocs.py @@ -525,7 +525,7 @@ def read_files( with open(filename, 'rb') as file: existing_hash = hashlib.md5(file.read()).hexdigest() if os.path.exists(filename + ".md5"): - with open(filename + ".md5", "r", encoding="utf-8") as md5_f: + with open(filename + ".md5", encoding="utf-8") as md5_f: stored_hash = md5_f.read() if stored_hash and stored_hash.strip() == existing_hash.strip(): From 29a7157db60837478d0795abde7b3be0066b0dff Mon Sep 17 00:00:00 2001 From: Anthony Shaw Date: Fri, 20 Oct 2023 13:31:18 +1100 Subject: [PATCH 4/6] fix f-string --- scripts/prepdocs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py index 58a2f25937..2dda08a855 100644 --- a/scripts/prepdocs.py +++ b/scripts/prepdocs.py @@ -518,6 +518,7 @@ def read_files( try: # if filename ends in .md5 skip if filename.endswith('.md5'): + print("Skipping md5 hash index.") continue # if there is a file called .md5 in this directory, see if its updated @@ -529,7 +530,7 @@ def read_files( stored_hash = md5_f.read() if stored_hash and stored_hash.strip() == existing_hash.strip(): - print("Skipping {filename}, no changes detected") + print(f"Skipping {filename}, no changes detected.") continue else: # Write the hash From e41d5120df68952015b251c391b385cde7e0cba7 Mon Sep 17 00:00:00 2001 From: Anthony Shaw Date: Fri, 20 Oct 2023 13:33:28 +1100 Subject: [PATCH 5/6] reformat changes --- scripts/prepdocs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py index 2dda08a855..6e64122e7d 100644 --- a/scripts/prepdocs.py +++ b/scripts/prepdocs.py @@ -517,13 +517,13 @@ def read_files( continue try: # if filename ends in .md5 skip - if filename.endswith('.md5'): + if filename.endswith(".md5"): print("Skipping md5 hash index.") continue # if there is a file called .md5 in this directory, see if its updated stored_hash = None - with open(filename, 'rb') as file: + with open(filename, "rb") as file: existing_hash = hashlib.md5(file.read()).hexdigest() if os.path.exists(filename + ".md5"): with open(filename + ".md5", encoding="utf-8") as md5_f: From aeb75f776b1e230739f4f9268c3c731e4565cd27 Mon Sep 17 00:00:00 2001 From: Anthony Shaw Date: Fri, 20 Oct 2023 14:38:49 +1100 Subject: [PATCH 6/6] Update prepdocs.py --- scripts/prepdocs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py index 6e64122e7d..7edcb5a50c 100644 --- a/scripts/prepdocs.py +++ b/scripts/prepdocs.py @@ -518,7 +518,6 @@ def read_files( try: # if filename ends in .md5 skip if filename.endswith(".md5"): - print("Skipping md5 hash index.") continue # if there is a file called .md5 in this directory, see if its updated