Skip to content

Commit

Permalink
model: dbrx: fix tokenizer conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
phymbert committed Apr 7, 2024
1 parent 06a59ab commit 05833db
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1476,7 +1476,14 @@ def _set_vocab_gpt2(self):
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) # FIXME https://huggingface.co/databricks/dbrx-instruct/blob/main/tokenizer_config.json
special_vocab.merges = []
# only add special tokens when they were not already loaded from config.json
if len(special_vocab.special_token_ids) == 0:
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)


Expand Down

0 comments on commit 05833db

Please sign in to comment.