Skip to content

Commit

Permalink
model: dbrx: fix tokenizer conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
phymbert committed Apr 7, 2024
1 parent 06a59ab commit 05833db
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1476,7 +1476,14 @@ def _set_vocab_gpt2(self):
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) # FIXME https://huggingface.co/databricks/dbrx-instruct/blob/main/tokenizer_config.json
special_vocab.merges = []
# only add special tokens when they were not already loaded from config.json
if len(special_vocab.special_token_ids) == 0:
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)


Expand Down

0 comments on commit 05833db

Please sign in to comment.