From c21d5e13fef3562bf364f343e9a2c7533410f514 Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 15:24:35 +0800
Subject: [PATCH 1/8] fix: llama-3 ignore_merges

---
 llama.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 2f1123d4e1678..b73281d03832b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12292,6 +12292,20 @@ struct llm_tokenizer_bpe {
         symbols_final.clear();
 
         for (auto & word : word_collection) {
+            if (vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+                llm_symbol sym;
+                sym.text = word.c_str();
+                sym.n = word.size();
+                sym.prev = final_prev_index;
+                sym.next = -1;
+                if (final_prev_index != -1) {
+                    symbols_final[final_prev_index].next = symbols_final.size();
+                }
+                symbols_final.emplace_back(sym);
+                final_prev_index = symbols_final.size() - 1;
+                continue;
+            }
+
             work_queue = llm_bigram_bpe::queue();
             symbols.clear();
 

From c7614930f3cafa14602b5c6d3eb9a8fc4a6a3b29 Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 15:49:31 +0800
Subject: [PATCH 2/8] test: add test for llama-3 bpe ignore_merges

---
 tests/test-tokenizer-1-bpe.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index a0e2caf9427eb..015d6a2e41a5a 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -65,7 +65,18 @@ int main(int argc, char **argv) {
         std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+            if (tokens.size() > 1) {
+                fprintf(stderr,
+                        "%s : error: token %d detokenizes to '%s'(%zu) but "
+                        "tokenization of this to multiple tokens: [",
+                        __func__, i, str.c_str(), str.length());
+                fprintf(stderr, "%d", tokens[0]);
+                for (size_t i = 1; i < tokens.size(); i++) {
+                    fprintf(stderr, ", %d", tokens[i]);
+                }
+                fprintf(stderr, "]\n");
+            }
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {
                 fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
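Note: the behavior this series ports is BPE "ignore_merges" (the ignore_merges option of the Hugging Face tokenizers BPE model, which llama-3 enables): if a whole pre-tokenized word is already a vocabulary entry, it is emitted as that single token and the merge loop is skipped. Below is a minimal standalone sketch of just that rule, not llama.cpp code; the toy vocab and the per-character fallback stand in for the real merge loop:

    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    static std::vector<int> tokenize_word(const std::string & word,
                                          const std::unordered_map<std::string, int> & vocab,
                                          bool ignore_merges) {
        std::vector<int> out;
        // ignore_merges: a pre-tokenized word that is already a vocab entry
        // becomes exactly one token; BPE merging never runs for it
        if (ignore_merges) {
            auto it = vocab.find(word);
            if (it != vocab.end()) {
                out.push_back(it->second);
                return out;
            }
        }
        // stand-in for the real merge loop: one token per character
        // (actual BPE repeatedly merges the highest-ranked adjacent pair)
        for (char c : word) {
            auto it = vocab.find(std::string(1, c));
            out.push_back(it != vocab.end() ? it->second : -1);
        }
        return out;
    }

    int main() {
        const std::unordered_map<std::string, int> vocab = { {"h", 1}, {"i", 2}, {"hi", 3} };
        for (int id : tokenize_word("hi", vocab, true))  printf("%d ", id);   // prints: 3
        printf("\n");
        for (int id : tokenize_word("hi", vocab, false)) printf("%d ", id);   // prints: 1 2
        printf("\n");
        return 0;
    }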
From 8a51d3b12dfe645e217260c36896708bd297171e Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 16:12:20 +0800
Subject: [PATCH 3/8] fix: set ignore_merges only for llama-3

---
 llama.cpp                      |  4 +++-
 tests/test-tokenizer-1-bpe.cpp | 19 ++++++++++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index b73281d03832b..af005c12c70eb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12200,12 +12200,14 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
+        int ignore_merges = false;
 
         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
                 switch (vocab.type_pre) {
                     case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        ignore_merges = true;
                     case LLAMA_VOCAB_PRE_TYPE_DBRX:
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
@@ -12292,7 +12294,7 @@ struct llm_tokenizer_bpe {
         symbols_final.clear();
 
         for (auto & word : word_collection) {
-            if (vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
                 llm_symbol sym;
                 sym.text = word.c_str();
                 sym.n = word.size();
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 015d6a2e41a5a..63e261d8db9c2 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -13,15 +13,27 @@
 #include <locale>
 
 int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+    if (argc < 2 || argc > 3) {
+        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
+    bool ignore_merges = false;
+    if (argc == 3) {
+        if (std::strcmp(argv[2], "ignore-merges") != 0) {
+            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+            return 1;
+        }
+        ignore_merges = true;
+    }
 
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
+    if (ignore_merges) {
+        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
+    }
+
     llama_model * model;
     llama_context * ctx;
 
@@ -66,7 +78,7 @@ int main(int argc, char **argv) {
         try {
             auto cps = unicode_cpts_from_utf8(str);
             std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-            if (tokens.size() > 1) {
+            if (ignore_merges && tokens.size() > 1) {
                 fprintf(stderr,
                         "%s : error: token %d detokenizes to '%s'(%zu) but "
                         "tokenization of this to multiple tokens: [",
@@ -76,6 +88,7 @@ int main(int argc, char **argv) {
                     fprintf(stderr, ", %d", tokens[i]);
                 }
                 fprintf(stderr, "]\n");
+                return 2;
             }
             std::string check = llama_detokenize_bpe(ctx, tokens);
             if (check != str) {

From 5d30a6ddd07ec28b3d53ebf41364a755fe4e6526 Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 16:31:01 +0800
Subject: [PATCH 4/8] fix: test-tokenizer-1-bpe --ignore-merges detection

---
 tests/test-tokenizer-1-bpe.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 63e261d8db9c2..209a04ad6f77a 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -21,7 +21,7 @@ int main(int argc, char **argv) {
     const std::string fname = argv[1];
     bool ignore_merges = false;
     if (argc == 3) {
-        if (std::strcmp(argv[2], "ignore-merges") != 0) {
+        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
             fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
             return 1;
         }

From 1fb5b55894359ef1ac88266157d209deb77cb370 Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 18:18:01 +0800
Subject: [PATCH 5/8] fix: copy to fix fallthrough

---
 llama.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index af005c12c70eb..152d15e56c10c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12208,7 +12208,6 @@ struct llm_tokenizer_bpe {
                 switch (vocab.type_pre) {
                     case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                         ignore_merges = true;
-                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
                             //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12217,6 +12216,12 @@ struct llm_tokenizer_bpe {
                             "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                        word_collection = unicode_regex_split(text, {
+                            // same as llama3
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                     case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                         word_collection = unicode_regex_split(text, {
                             "[\r\n]",
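Note: patch 3 above had the LLAMA3 case set ignore_merges and then fall through into the DBRX case so both shared one split regex. Patch 5 removes the implicit fallthrough by giving DBRX its own copy of the regex and its own break, presumably because an unannotated fallthrough is easy to misread and is exactly what compilers warn about under -Wimplicit-fallthrough. The shape of the change in miniature (illustrative names, not llama.cpp's):

    #include <cstdio>

    enum pre_type { PRE_LLAMA3, PRE_DBRX };

    static void configure(pre_type t) {
        bool ignore_merges = false;
        const char * regex = "";
        switch (t) {
            case PRE_LLAMA3:
                ignore_merges = true;
                regex = "<llama3 regex>"; // body duplicated below ...
                break;                    // ... so this case can break cleanly
            case PRE_DBRX:
                regex = "<llama3 regex>"; // same split regex, but no ignore_merges
                break;
        }
        printf("ignore_merges=%d regex=%s\n", (int) ignore_merges, regex);
    }

    int main() {
        configure(PRE_LLAMA3);
        configure(PRE_DBRX);
        return 0;
    }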
From c3d0f41d50425e8662b3bab874c9a994620aa2c4 Mon Sep 17 00:00:00 2001
From: Haoxiang Fei
Date: Fri, 10 May 2024 19:19:29 +0800
Subject: [PATCH 6/8] fix: change ignore_merges to bool

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 152d15e56c10c..554b34a8e7383 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12200,7 +12200,7 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
-        int ignore_merges = false;
+        bool ignore_merges = false;
 
         std::vector<std::string> word_collection;
         switch (vocab.type) {

From 0c9a0aef4c81def7b37ef28c02c50f7762f5b67d Mon Sep 17 00:00:00 2001
From: Tony Fettes
Date: Fri, 10 May 2024 21:26:07 +0800
Subject: [PATCH 7/8] fix: add ignore merges tests to cmake

---
 models/ggml-vocab-llama-bpe.gguf.inp | 2 ++
 models/ggml-vocab-llama-bpe.gguf.out | 1 +
 tests/CMakeLists.txt                 | 2 +-
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp
index 0a89107c60d7f..9380bf355202a 100644
--- a/models/ggml-vocab-llama-bpe.gguf.inp
+++ b/models/ggml-vocab-llama-bpe.gguf.inp
@@ -104,3 +104,5 @@
 __ggml_vocab_test__
 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
 __ggml_vocab_test__
+ Việt
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out
index 1f00e3812e227..1f3607fb6a378 100644
--- a/models/ggml-vocab-llama-bpe.gguf.out
+++ b/models/ggml-vocab-llama-bpe.gguf.out
@@ -41,3 +41,4 @@
  8765 8765 1644
  8765 8765 8765
  198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
+ 101798
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d409a1d6b42ec..766a017524237 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -92,7 +92,7 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
 install(TARGETS test-tokenizer-1-bpe RUNTIME)
 
 # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
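Note: the vocab test data in patch 7 pairs line-for-line: each __ggml_vocab_test__-delimited entry in ggml-vocab-llama-bpe.gguf.inp corresponds to one line of expected token IDs in ggml-vocab-llama-bpe.gguf.out. The new entry " Việt" (leading space included) expects the single ID 101798: a word that exists verbatim in the llama-3 vocabulary and therefore must come out as one token once merges are ignored for in-vocab words, which is the case this series exists to cover.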
From b8d3cd5337bfa74f816138af84e7181c5208f717 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 11 May 2024 11:10:23 +0300
Subject: [PATCH 8/8] llama : alternative merge ignore logic

---
 llama.cpp | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 554b34a8e7383..a7e27e4b0512e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12299,26 +12299,17 @@ struct llm_tokenizer_bpe {
         symbols_final.clear();
 
         for (auto & word : word_collection) {
-            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
-                llm_symbol sym;
-                sym.text = word.c_str();
-                sym.n = word.size();
-                sym.prev = final_prev_index;
-                sym.next = -1;
-                if (final_prev_index != -1) {
-                    symbols_final[final_prev_index].next = symbols_final.size();
-                }
-                symbols_final.emplace_back(sym);
-                final_prev_index = symbols_final.size() - 1;
-                continue;
-            }
-
             work_queue = llm_bigram_bpe::queue();
             symbols.clear();
 
             int index = 0;
             size_t offset = 0;
 
+            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                offset = word.size();
+            }
+
             while (offset < word.size()) {
                 llm_symbol sym;
                 size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
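Note: patch 8 replaces patch 1's early-exit special case with a simpler seeding trick: when the whole word is an in-vocab token, it pushes a single symbol spanning the word and advances offset past the end, so the UTF-8 splitting loop does nothing, no bigrams are queued, and the lone symbol flows through the normal downstream path untouched. A minimal standalone sketch of that control flow (illustrative, not llama.cpp's types):

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        const std::string word = "hello";
        const bool word_is_vocab_token = true; // stand-in for vocab.token_to_id.find(word)

        std::vector<std::string> symbols;
        size_t offset = 0;

        if (word_is_vocab_token) {
            symbols.push_back(word); // one symbol covering the whole word
            offset = word.size();    // the split loop below becomes a no-op
        }

        while (offset < word.size()) {
            // per-character stand-in for llama.cpp's UTF-8-aware splitting
            symbols.push_back(std::string(1, word[offset]));
            offset += 1;
        }

        printf("%zu symbol(s)\n", symbols.size()); // prints: 1 symbol(s)
        return 0;
    }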