From 05833db5f64429eab25574067bb724d1bda3915a Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sun, 7 Apr 2024 04:23:32 +0200
Subject: [PATCH] model: dbrx: convert fix tokenizer

---
 convert-hf-to-gguf.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 6bbe8a0a618f8..a518946a0742d 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1476,7 +1476,14 @@ def _set_vocab_gpt2(self):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)  # FIXME https://huggingface.co/databricks/dbrx-instruct/blob/main/tokenizer_config.json
+        special_vocab.merges = []
+        # only add special tokens when they were not already loaded from config.json
+        if len(special_vocab.special_token_ids) == 0:
+            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
+            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)