From c21d5e13fef3562bf364f343e9a2c7533410f514 Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 15:24:35 +0800
Subject: [PATCH 1/8] fix: llama-3 ignore_merges

---
 llama.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 2f1123d4e1678..b73281d03832b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12292,6 +12292,20 @@ struct llm_tokenizer_bpe {
         symbols_final.clear();
 
         for (auto & word : word_collection) {
+            if (vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+                llm_symbol sym;
+                sym.text = word.c_str();
+                sym.n = word.size();
+                sym.prev = final_prev_index;
+                sym.next = -1;
+                if (final_prev_index != -1) {
+                    symbols_final[final_prev_index].next = symbols_final.size();
+                }
+                symbols_final.emplace_back(sym);
+                final_prev_index = symbols_final.size() - 1;
+                continue;
+            }
+
             work_queue = llm_bigram_bpe::queue();
             symbols.clear();
 

From c7614930f3cafa14602b5c6d3eb9a8fc4a6a3b29 Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 15:49:31 +0800
Subject: [PATCH 2/8] test: add test for llama-3 bpe ignore_merges

---
 tests/test-tokenizer-1-bpe.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index a0e2caf9427eb..015d6a2e41a5a 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -65,7 +65,18 @@ int main(int argc, char **argv) {
         std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+            if (tokens.size() > 1) {
+                fprintf(stderr,
+                        "%s : error: token %d detokenizes to '%s'(%zu) but "
+                        "tokenization of this to multiple tokens: [",
+                        __func__, i, str.c_str(), str.length());
+                fprintf(stderr, "%d", tokens[0]);
+                for (size_t i = 1; i < tokens.size(); i++) {
+                    fprintf(stderr, ", %d", tokens[i]);
+                }
+                fprintf(stderr, "]\n");
+            }
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {
                 fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
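Note: the behavior this series ports is BPE "ignore_merges" (the ignore_merges option of the Hugging Face tokenizers BPE model, which llama-3 enables): if a whole pre-tokenized word is already a vocabulary entry, it is emitted as that single token and the merge loop is skipped. Below is a minimal standalone sketch of just that rule, not llama.cpp code; the toy vocab and the per-character fallback stand in for the real merge loop:

    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    static std::vector<int> tokenize_word(const std::string & word,
                                          const std::unordered_map<std::string, int> & vocab,
                                          bool ignore_merges) {
        std::vector<int> out;
        // ignore_merges: a pre-tokenized word that is already a vocab entry
        // becomes exactly one token; BPE merging never runs for it
        if (ignore_merges) {
            auto it = vocab.find(word);
            if (it != vocab.end()) {
                out.push_back(it->second);
                return out;
            }
        }
        // stand-in for the real merge loop: one token per character
        // (actual BPE repeatedly merges the highest-ranked adjacent pair)
        for (char c : word) {
            auto it = vocab.find(std::string(1, c));
            out.push_back(it != vocab.end() ? it->second : -1);
        }
        return out;
    }

    int main() {
        const std::unordered_map<std::string, int> vocab = { {"h", 1}, {"i", 2}, {"hi", 3} };
        for (int id : tokenize_word("hi", vocab, true))  printf("%d ", id);   // prints: 3
        printf("\n");
        for (int id : tokenize_word("hi", vocab, false)) printf("%d ", id);   // prints: 1 2
        printf("\n");
        return 0;
    }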
From 8a51d3b12dfe645e217260c36896708bd297171e Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 16:12:20 +0800
Subject: [PATCH 3/8] fix: set ignore_merges only for llama-3

---
 llama.cpp                      |  4 +++-
 tests/test-tokenizer-1-bpe.cpp | 19 ++++++++++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index b73281d03832b..af005c12c70eb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12200,12 +12200,14 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
+        int ignore_merges = false;
 
         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
                 switch (vocab.type_pre) {
                     case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        ignore_merges = true;
                     case LLAMA_VOCAB_PRE_TYPE_DBRX:
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
@@ -12292,7 +12294,7 @@ struct llm_tokenizer_bpe {
         symbols_final.clear();
 
         for (auto & word : word_collection) {
-            if (vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
                 llm_symbol sym;
                 sym.text = word.c_str();
                 sym.n = word.size();
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 015d6a2e41a5a..63e261d8db9c2 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -13,15 +13,27 @@
 #include <locale>
 
 int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+    if (argc < 2 || argc > 3) {
+        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
+    bool ignore_merges = false;
+    if (argc == 3) {
+        if (std::strcmp(argv[2], "ignore-merges") != 0) {
+            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+            return 1;
+        }
+        ignore_merges = true;
+    }
 
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
+    if (ignore_merges) {
+        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
+    }
+
     llama_model * model;
     llama_context * ctx;
 
@@ -66,7 +78,7 @@ int main(int argc, char **argv) {
         try {
             auto cps = unicode_cpts_from_utf8(str);
             std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-            if (tokens.size() > 1) {
+            if (ignore_merges && tokens.size() > 1) {
                 fprintf(stderr,
                         "%s : error: token %d detokenizes to '%s'(%zu) but "
                         "tokenization of this to multiple tokens: [",
@@ -76,6 +88,7 @@ int main(int argc, char **argv) {
                     fprintf(stderr, ", %d", tokens[i]);
                 }
                 fprintf(stderr, "]\n");
+                return 2;
             }
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {

From 5d30a6ddd07ec28b3d53ebf41364a755fe4e6526 Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 16:31:01 +0800
Subject: [PATCH 4/8] fix: test-tokenizer-1-bpe --ignore-merges detection

---
 tests/test-tokenizer-1-bpe.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 63e261d8db9c2..209a04ad6f77a 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -21,7 +21,7 @@ int main(int argc, char **argv) {
     const std::string fname = argv[1];
     bool ignore_merges = false;
     if (argc == 3) {
-        if (std::strcmp(argv[2], "ignore-merges") != 0) {
+        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
             fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
             return 1;
         }

From 1fb5b55894359ef1ac88266157d209deb77cb370 Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 18:18:01 +0800
Subject: [PATCH 5/8] fix: copy to fix fallthrough

---
 llama.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index af005c12c70eb..152d15e56c10c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12208,7 +12208,6 @@ struct llm_tokenizer_bpe {
                 switch (vocab.type_pre) {
                     case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                         ignore_merges = true;
-                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
                             //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12217,6 +12216,12 @@ struct llm_tokenizer_bpe {
                             "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                        word_collection = unicode_regex_split(text, {
+                            // same as llama3
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                     case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                         word_collection = unicode_regex_split(text, {
                             "[\r\n]",
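Note: patch 3 above had the LLAMA3 case set ignore_merges and then fall through into the DBRX case so both shared one split regex. Patch 5 removes the implicit fallthrough by giving DBRX its own copy of the regex and its own break, presumably because an unannotated fallthrough is easy to misread and is exactly what compilers warn about under -Wimplicit-fallthrough. The shape of the change in miniature (illustrative names, not llama.cpp's):

    #include <cstdio>

    enum pre_type { PRE_LLAMA3, PRE_DBRX };

    static void configure(pre_type t) {
        bool ignore_merges = false;
        const char * regex = "";
        switch (t) {
            case PRE_LLAMA3:
                ignore_merges = true;
                regex = "<llama3 regex>"; // body duplicated below ...
                break;                    // ... so this case can break cleanly
            case PRE_DBRX:
                regex = "<llama3 regex>"; // same split regex, but no ignore_merges
                break;
        }
        printf("ignore_merges=%d regex=%s\n", (int) ignore_merges, regex);
    }

    int main() {
        configure(PRE_LLAMA3);
        configure(PRE_DBRX);
        return 0;
    }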
From c3d0f41d50425e8662b3bab874c9a994620aa2c4 Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 19:19:29 +0800
Subject: [PATCH 6/8] fix: change ignore_merges to bool

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 152d15e56c10c..554b34a8e7383 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12200,7 +12200,7 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
-        int ignore_merges = false;
+        bool ignore_merges = false;
 
         std::vector<std::string> word_collection;
         switch (vocab.type) {

From 0c9a0aef4c81def7b37ef28c02c50f7762f5b67d Mon Sep 17 00:00:00 2001
From: Tony Fettes
Date: Fri, 10 May 2024 21:26:07 +0800
Subject: [PATCH 7/8] fix: add ignore merges tests to cmake

---
 models/ggml-vocab-llama-bpe.gguf.inp | 2 ++
 models/ggml-vocab-llama-bpe.gguf.out | 1 +
 tests/CMakeLists.txt                 | 2 +-
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp
index 0a89107c60d7f..9380bf355202a 100644
--- a/models/ggml-vocab-llama-bpe.gguf.inp
+++ b/models/ggml-vocab-llama-bpe.gguf.inp
@@ -104,3 +104,5 @@
 __ggml_vocab_test__
 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
 __ggml_vocab_test__
+ Việt
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out
index 1f00e3812e227..1f3607fb6a378 100644
--- a/models/ggml-vocab-llama-bpe.gguf.out
+++ b/models/ggml-vocab-llama-bpe.gguf.out
@@ -41,3 +41,4 @@
  8765 8765 1644
  8765 8765 8765
  198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
+ 101798
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d409a1d6b42ec..766a017524237 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -92,7 +92,7 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
 install(TARGETS test-tokenizer-1-bpe RUNTIME)
 
 # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
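Note: the vocab test data in patch 7 pairs line-for-line: each __ggml_vocab_test__-delimited entry in ggml-vocab-llama-bpe.gguf.inp corresponds to one line of expected token IDs in ggml-vocab-llama-bpe.gguf.out. The new entry " Việt" (leading space included) expects the single ID 101798: a word that exists verbatim in the llama-3 vocabulary and therefore must come out as one token once merges are ignored for in-vocab words, which is the case this series exists to cover.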
From b8d3cd5337bfa74f816138af84e7181c5208f717 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 11 May 2024 11:10:23 +0300
Subject: [PATCH 8/8] llama : alternative merge ignore logic

---
 llama.cpp | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 554b34a8e7383..a7e27e4b0512e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12299,26 +12299,17 @@ struct llm_tokenizer_bpe {
         symbols_final.clear();
 
         for (auto & word : word_collection) {
-            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
-                llm_symbol sym;
-                sym.text = word.c_str();
-                sym.n = word.size();
-                sym.prev = final_prev_index;
-                sym.next = -1;
-                if (final_prev_index != -1) {
-                    symbols_final[final_prev_index].next = symbols_final.size();
-                }
-                symbols_final.emplace_back(sym);
-                final_prev_index = symbols_final.size() - 1;
-                continue;
-            }
-
             work_queue = llm_bigram_bpe::queue();
             symbols.clear();
 
             int index = 0;
             size_t offset = 0;
 
+            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                offset = word.size();
+            }
+
             while (offset < word.size()) {
                 llm_symbol sym;
                 size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
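Note: patch 8 replaces patch 1's early-exit special case with a simpler seeding trick: when the whole word is an in-vocab token, it pushes a single symbol spanning the word and advances offset past the end, so the UTF-8 splitting loop does nothing, no bigrams are queued, and the lone symbol flows through the normal downstream path untouched. A minimal standalone sketch of that control flow (illustrative, not llama.cpp's types):

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        const std::string word = "hello";
        const bool word_is_vocab_token = true; // stand-in for vocab.token_to_id.find(word)

        std::vector<std::string> symbols;
        size_t offset = 0;

        if (word_is_vocab_token) {
            symbols.push_back(word); // one symbol covering the whole word
            offset = word.size();    // the split loop below becomes a no-op
        }

        while (offset < word.size()) {
            // per-character stand-in for llama.cpp's UTF-8-aware splitting
            symbols.push_back(std::string(1, word[offset]));
            offset += 1;
        }

        printf("%zu symbol(s)\n", symbols.size()); // prints: 1 symbol(s)
        return 0;
    }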