feat: sync llama.cpp
jhen0409 committed Jan 20, 2025
1 parent 3ce570c commit aa19ec5
Showing 5 changed files with 28 additions and 31 deletions.
30 changes: 15 additions & 15 deletions scripts/llama.cpp.patch
@@ -1,27 +1,27 @@
-diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index 1d2bd932..b5007c66 100644
---- a/src/llama.cpp/common/common.h
-+++ b/src/llama.cpp/common/common.h
-@@ -183,6 +183,7 @@ struct common_params_vocoder {
- };
- 
- struct common_params {
-+    bool vocab_only = false;
-     int32_t n_predict = -1; // new tokens to predict
-     int32_t n_ctx = 4096; // context size
-     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index 20be9291..1bedc55d 100644
+index 451826d5..a85ac028 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -1017,6 +1017,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+@@ -1043,6 +1043,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
      if (params.n_gpu_layers != -1) {
          mparams.n_gpu_layers = params.n_gpu_layers;
      }
 +    mparams.vocab_only = params.vocab_only;
-     mparams.rpc_servers = params.rpc_servers.c_str();
      mparams.main_gpu = params.main_gpu;
      mparams.split_mode = params.split_mode;
+     mparams.tensor_split = params.tensor_split;
+diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
+index 3bcc637c..19ae7dad 100644
+--- a/src/llama.cpp/common/common.h
++++ b/src/llama.cpp/common/common.h
+@@ -189,6 +189,7 @@ struct common_params_vocoder {
+ };
+ 
+ struct common_params {
++    bool vocab_only = false;
+     int32_t n_predict = -1; // new tokens to predict
+     int32_t n_ctx = 4096; // context size
+     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 index 6b3641c4..6d6cb27f 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
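
The hunk above adds a vocab_only flag to common_params and forwards it into llama_model_params, so the bindings can load only a model's tokenizer data without the weight tensors. Below is a minimal sketch of how that flag could be used, assuming the patched common_model_params_to_llama() plus the llama_model_load_from_file(), llama_model_free() and llama_vocab_n_tokens() entry points of the synced llama.cpp revision; it is illustrative and not part of this commit.

#include "common.h"
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
  if (argc < 2) {
    fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
    return 1;
  }

  llama_backend_init();

  common_params params;
  params.vocab_only = true;  // the flag introduced by the patch above

  // The patched helper copies the flag into llama_model_params.
  llama_model_params mparams = common_model_params_to_llama(params);

  // With vocab_only set, only tokenizer/vocab data is read, not the tensors.
  llama_model * model = llama_model_load_from_file(argv[1], mparams);
  if (model == nullptr) {
    fprintf(stderr, "failed to load vocab from %s\n", argv[1]);
    return 1;
  }

  const llama_vocab * vocab = llama_model_get_vocab(model);
  printf("n_tokens in vocab: %d\n", llama_vocab_n_tokens(vocab));

  llama_model_free(model);
  llama_backend_free();
  return 0;
}
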
7 changes: 4 additions & 3 deletions src/EmbeddingWorker.cpp
@@ -9,10 +9,11 @@ void EmbeddingWorker::Execute() {
   llama_kv_cache_clear(_sess->context());
   auto tokens = ::common_tokenize(_sess->context(), _text, true);
   // add SEP if not present
-  if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
-    tokens.push_back(llama_token_sep(_sess->model()));
+  auto vocab = llama_model_get_vocab(_sess->model());
+  if (tokens.empty() || tokens.back() != llama_vocab_sep(vocab)) {
+    tokens.push_back(llama_vocab_sep(vocab));
   }
-  const int n_embd = llama_n_embd(_sess->model());
+  const int n_embd = llama_model_n_embd(_sess->model());
   do {
     auto ctx = _sess->context();
     int ret =
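
The change above follows the new llama.cpp convention that token helpers take a llama_vocab * obtained via llama_model_get_vocab() rather than the llama_model * itself, and that llama_n_embd() becomes llama_model_n_embd(). A small sketch of the same pattern outside the worker; append_sep_if_missing() and embedding_size() are hypothetical helper names used only for illustration.

#include "llama.h"
#include <vector>

// Hypothetical helper mirroring the patched EmbeddingWorker logic:
// append the SEP token if the tokenized input does not already end with it.
static void append_sep_if_missing(const llama_model * model,
                                  std::vector<llama_token> & tokens) {
  const llama_vocab * vocab = llama_model_get_vocab(model);  // new accessor
  if (tokens.empty() || tokens.back() != llama_vocab_sep(vocab)) {
    tokens.push_back(llama_vocab_sep(vocab));
  }
}

// llama_model_n_embd() replaces llama_n_embd() for the embedding dimension.
static int embedding_size(const llama_model * model) {
  return llama_model_n_embd(model);
}
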
6 changes: 4 additions & 2 deletions src/LlamaCompletionWorker.cpp
@@ -59,7 +59,9 @@ void LlamaCompletionWorker::Execute() {
   size_t n_cur = 0;
   size_t n_input = 0;
   const auto model = _sess->model();
-  const bool add_bos = llama_add_bos_token(model);
+  auto vocab = llama_model_get_vocab(model);
+
+  const bool add_bos = llama_vocab_get_add_bos(vocab);
   auto ctx = _sess->context();
 
   auto sparams = llama_sampler_chain_default_params();
@@ -130,7 +132,7 @@ void LlamaCompletionWorker::Execute() {
       });
     }
     // is it an end of generation?
-    if (llama_token_is_eog(model, new_token_id)) {
+    if (llama_vocab_is_eog(vocab, new_token_id)) {
       break;
     }
     // check for stop words
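
Both hunks are mechanical renames: llama_add_bos_token(model) becomes llama_vocab_get_add_bos(vocab) and llama_token_is_eog(model, ...) becomes llama_vocab_is_eog(vocab, ...), with the vocab fetched once from the model. A hedged sketch of the same calls in a bare sampling loop; the greedy sampler chain here is illustrative, not the worker's actual configuration.

#include "llama.h"

// Sketch: stop on end-of-generation using the new vocab-based API.
// Assumes `ctx` already holds an evaluated prompt; feeding sampled tokens
// back through llama_decode() is elided.
static void sample_until_eog(llama_model * model, llama_context * ctx,
                             int max_tokens) {
  const llama_vocab * vocab = llama_model_get_vocab(model);

  // replaces llama_add_bos_token(model); would be passed to the tokenizer
  const bool add_bos = llama_vocab_get_add_bos(vocab);
  (void) add_bos;

  llama_sampler * smpl =
      llama_sampler_chain_init(llama_sampler_chain_default_params());
  llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

  for (int i = 0; i < max_tokens; i++) {
    const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
    // replaces llama_token_is_eog(model, new_token_id)
    if (llama_vocab_is_eog(vocab, new_token_id)) {
      break;
    }
    // ... detokenize new_token_id and decode it to continue generating ...
  }

  llama_sampler_free(smpl);
}
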
14 changes: 4 additions & 10 deletions src/LlamaContext.cpp
@@ -157,16 +157,10 @@ Napi::Value LlamaContext::GetSystemInfo(const Napi::CallbackInfo &info) {
 }
 
 bool validateModelChatTemplate(const struct llama_model * model) {
-  std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-  std::string template_key = "tokenizer.chat_template";
-  int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-  if (res >= 0) {
-    llama_chat_message chat[] = {{"user", "test"}};
-    std::string tmpl = std::string(model_template.data(), model_template.size());
-    int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
-    return chat_res > 0;
-  }
-  return res > 0;
+  llama_chat_message chat[] = {{"user", "test"}};
+  const char * tmpl = llama_model_chat_template(model);
+  int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
+  return chat_res > 0;
 }
 
 // getModelInfo(): object
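
validateModelChatTemplate() no longer reads tokenizer.chat_template out of the GGUF metadata by hand: the synced API exposes it through llama_model_chat_template(model), and llama_chat_apply_template() drops the model parameter. A hedged sketch of formatting a prompt with that pair of calls; the two-pass buffer sizing is the usual pattern and format_user_message() is an illustrative name.

#include "llama.h"
#include <string>
#include <vector>

// Sketch: render one user message with the model's built-in chat template.
// Returns an empty string if no template is available or rendering fails.
static std::string format_user_message(const llama_model * model,
                                       const char * text) {
  const char * tmpl = llama_model_chat_template(model);  // may be nullptr
  if (tmpl == nullptr) {
    return "";
  }

  llama_chat_message chat[] = {{"user", text}};

  // First call with no buffer returns the required length.
  int32_t len = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
  if (len <= 0) {
    return "";
  }

  std::vector<char> buf(len);
  len = llama_chat_apply_template(tmpl, chat, 1, true, buf.data(),
                                  (int32_t) buf.size());
  if (len <= 0) {
    return "";
  }
  return std::string(buf.data(), len);
}
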
2 changes: 1 addition & 1 deletion src/llama.cpp
Submodule llama.cpp updated 149 files
