Skip to content

Commit

Permalink
Store an MD5 hash of uploaded/indexed file and check before prepdocs (Azure-Samples#835)
Browse files Browse the repository at this point in the history

* Hash the uploaded files locally and skip them if you provision a second time and they haven't changed

* Overwrite the hash when it changes

* Remove open mode parameter

* fix f-string

* reformat changes

* Update prepdocs.py
  • Loading branch information
tonybaloney authored Oct 22, 2023
1 parent d37063f commit cf215ab
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 1 deletion.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -144,4 +144,6 @@ cython_debug/
# NPM
npm-debug.log*
node_modules
static/
static/

data/*.md5
21 changes: 21 additions & 0 deletions scripts/prepdocs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import base64
import glob
import hashlib
import html
import io
import os
Expand Down Expand Up @@ -515,6 +516,26 @@ def read_files(
read_files(filename + "/*", use_vectors, vectors_batch_support)
continue
try:
# if filename ends in .md5 skip
if filename.endswith(".md5"):
continue

# if a stored hash file (<filename>.md5) exists alongside this file, check whether it is up to date
stored_hash = None
with open(filename, "rb") as file:
existing_hash = hashlib.md5(file.read()).hexdigest()
if os.path.exists(filename + ".md5"):
with open(filename + ".md5", encoding="utf-8") as md5_f:
stored_hash = md5_f.read()

if stored_hash and stored_hash.strip() == existing_hash.strip():
print(f"Skipping {filename}, no changes detected.")
continue
else:
# Write the hash
with open(filename + ".md5", "w", encoding="utf-8") as md5_f:
md5_f.write(existing_hash)

if not args.skipblobs:
upload_blobs(filename)
page_map = get_document_text(filename)
Expand Down

0 comments on commit cf215ab

Please sign in to comment.