From aa19ec5cc31706834a346770213cb087257e538f Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong
Date: Mon, 20 Jan 2025 11:34:39 +0800
Subject: [PATCH] feat: sync llama.cpp

---
 scripts/llama.cpp.patch       | 30 +++++++++++++++---------------
 src/EmbeddingWorker.cpp       |  7 ++++---
 src/LlamaCompletionWorker.cpp |  6 ++++--
 src/LlamaContext.cpp          | 14 ++++----------
 src/llama.cpp                 |  2 +-
 5 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch
index ca29828..fc292df 100644
--- a/scripts/llama.cpp.patch
+++ b/scripts/llama.cpp.patch
@@ -1,27 +1,27 @@
-diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index 1d2bd932..b5007c66 100644
---- a/src/llama.cpp/common/common.h
-+++ b/src/llama.cpp/common/common.h
-@@ -183,6 +183,7 @@ struct common_params_vocoder {
- };
- 
- struct common_params {
-+    bool vocab_only = false;
-     int32_t n_predict = -1; // new tokens to predict
-     int32_t n_ctx = 4096; // context size
-     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index 20be9291..1bedc55d 100644
+index 451826d5..a85ac028 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -1017,6 +1017,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+@@ -1043,6 +1043,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
      if (params.n_gpu_layers != -1) {
          mparams.n_gpu_layers = params.n_gpu_layers;
      }
 +    mparams.vocab_only = params.vocab_only;
-     mparams.rpc_servers = params.rpc_servers.c_str();
      mparams.main_gpu = params.main_gpu;
      mparams.split_mode = params.split_mode;
+     mparams.tensor_split = params.tensor_split;
+diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
+index 3bcc637c..19ae7dad 100644
+--- a/src/llama.cpp/common/common.h
++++ b/src/llama.cpp/common/common.h
+@@ -189,6 +189,7 @@ struct common_params_vocoder {
+ };
+ 
+ struct common_params {
++    bool vocab_only = false;
+     int32_t n_predict = -1; // new tokens to predict
+     int32_t n_ctx = 4096; // context size
+     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 index 6b3641c4..6d6cb27f 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
diff --git a/src/EmbeddingWorker.cpp b/src/EmbeddingWorker.cpp
index 0ad8d35..86da8d2 100644
--- a/src/EmbeddingWorker.cpp
+++ b/src/EmbeddingWorker.cpp
@@ -9,10 +9,11 @@ void EmbeddingWorker::Execute() {
   llama_kv_cache_clear(_sess->context());
   auto tokens = ::common_tokenize(_sess->context(), _text, true);
   // add SEP if not present
-  if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
-    tokens.push_back(llama_token_sep(_sess->model()));
+  auto vocab = llama_model_get_vocab(_sess->model());
+  if (tokens.empty() || tokens.back() != llama_vocab_sep(vocab)) {
+    tokens.push_back(llama_vocab_sep(vocab));
   }
-  const int n_embd = llama_n_embd(_sess->model());
+  const int n_embd = llama_model_n_embd(_sess->model());
   do {
     auto ctx = _sess->context();
     int ret =
diff --git a/src/LlamaCompletionWorker.cpp b/src/LlamaCompletionWorker.cpp
index 2ff96d3..e21f310 100644
--- a/src/LlamaCompletionWorker.cpp
+++ b/src/LlamaCompletionWorker.cpp
@@ -59,7 +59,9 @@ void LlamaCompletionWorker::Execute() {
   size_t n_cur = 0;
   size_t n_input = 0;
   const auto model = _sess->model();
-  const bool add_bos = llama_add_bos_token(model);
+  auto vocab = llama_model_get_vocab(model);
+
+  const bool add_bos = llama_vocab_get_add_bos(vocab);
   auto ctx = _sess->context();
 
   auto sparams = llama_sampler_chain_default_params();
@@ -130,7 +132,7 @@ void LlamaCompletionWorker::Execute() {
       });
     }
     // is it an end of generation?
-    if (llama_token_is_eog(model, new_token_id)) {
+    if (llama_vocab_is_eog(vocab, new_token_id)) {
       break;
     }
     // check for stop words
diff --git a/src/LlamaContext.cpp b/src/LlamaContext.cpp
index e60480a..bbcbae6 100644
--- a/src/LlamaContext.cpp
+++ b/src/LlamaContext.cpp
@@ -157,16 +157,10 @@ Napi::Value LlamaContext::GetSystemInfo(const Napi::CallbackInfo &info) {
 }
 
 bool validateModelChatTemplate(const struct llama_model * model) {
-  std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-  std::string template_key = "tokenizer.chat_template";
-  int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-  if (res >= 0) {
-    llama_chat_message chat[] = {{"user", "test"}};
-    std::string tmpl = std::string(model_template.data(), model_template.size());
-    int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
-    return chat_res > 0;
-  }
-  return res > 0;
+  llama_chat_message chat[] = {{"user", "test"}};
+  const char * tmpl = llama_model_chat_template(model);
+  int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
+  return chat_res > 0;
 }
 
 // getModelInfo(): object
diff --git a/src/llama.cpp b/src/llama.cpp
index c05e8c9..92bc493 160000
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1 +1 @@
-Subproject commit c05e8c9934f94fde49bc1bc9dc51eed282605150
+Subproject commit 92bc493917d43b83e592349e138b54c90b1c3ea7