From 525a1cda7f2956db6c4a156bf2e186dc75aa49e7 Mon Sep 17 00:00:00 2001 From: CrispStrobe <154636388+CrispStrobe@users.noreply.github.com> Date: Fri, 10 May 2024 23:12:13 +0200 Subject: [PATCH] error capturing if repo not reachable due to lack of licensed access --- convert-hf-to-gguf-update.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index b5eb41eacdab7..56582a1402501 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -142,8 +142,17 @@ def download_file_with_auth(url, token, save_path): if tokt == TOKENIZER_TYPE.SPM: continue + # Skip if the tokenizer folder does not exist or there are other download issues previously + if not os.path.exists(f"models/tokenizers/{name}"): + logger.warning(f"Directory for tokenizer {name} not found. Skipping...") + continue + # create the tokenizer - tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + try: + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + except OSError as e: + logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") + continue # Skip to the next model if the tokenizer can't be loaded chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() @@ -284,8 +293,17 @@ def get_vocab_base_pre(self, tokenizer) -> str: name = model["name"] tokt = model["tokt"] + # Skip if the tokenizer folder does not exist or there are other download issues previously + if not os.path.exists(f"models/tokenizers/{name}"): + logger.warning(f"Directory for tokenizer {name} not found. Skipping...") + continue + # create the tokenizer - tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + try: + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + except OSError as e: + logger.error(f"Failed to load tokenizer for model {name}. Error: {e}") + continue # Skip this model and continue with the next one in the loop with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f: for text in tests: