Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(deps): bump llama.cpp to '924518e2e5726e81f3aeb2518fb85963a500e… #4592

Merged
merged 1 commit into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=c05e8c9934f94fde49bc1bc9dc51eed282605150
CPPLLAMA_VERSION?=924518e2e5726e81f3aeb2518fb85963a500e93a

# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
Expand Down
42 changes: 18 additions & 24 deletions backend/cpp/llama/grpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,7 @@ struct llama_server_context
{
llama_model *model = nullptr;
llama_context *ctx = nullptr;
const llama_vocab * vocab = nullptr;

clip_ctx *clp_ctx = nullptr;

Expand All @@ -439,6 +440,7 @@ struct llama_server_context
bool clean_kv_cache = true;
bool all_slots_are_idle = false;
bool add_bos_token = true;
bool has_eos_token = true;

int32_t n_ctx; // total context for all clients / slots

Expand Down Expand Up @@ -502,7 +504,7 @@ struct llama_server_context

if (multimodal) {
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
const int n_embd_llm = llama_n_embd(model);
const int n_embd_llm = llama_model_n_embd(model);
if (n_embd_clip != n_embd_llm) {
LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
llama_free(ctx);
Expand All @@ -511,23 +513,15 @@ struct llama_server_context
}
}

vocab = llama_model_get_vocab(model);
n_ctx = llama_n_ctx(ctx);

add_bos_token = llama_add_bos_token(model);
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;

return true;
}

void validate_model_chat_template(server_params & sparams) {
llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1);
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) {
LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
}
}

llama_client_slot* get_active_slot() {
for (llama_client_slot& slot : slots) {
// Check if the slot is currently processing
Expand Down Expand Up @@ -725,8 +719,8 @@ struct llama_server_context
slot->prompt = "";
}

if (json_value(data, "ignore_eos", false)) {
slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
if (json_value(data, "ignore_eos", false) && has_eos_token) {
slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
}
/*
slot->sparams.penalty_prompt_tokens.clear();
Expand Down Expand Up @@ -765,13 +759,13 @@ struct llama_server_context
}
}
*/

slot->sparams.logit_bias.clear();

const auto &logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array())
{
const int n_vocab = llama_n_vocab(model);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
for (const auto &el : *logit_bias)
{
if (el.is_array() && el.size() == 2)
Expand Down Expand Up @@ -800,7 +794,7 @@ struct llama_server_context
}
else if (el[0].is_string())
{
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
for (auto tok : toks)
{
slot->sparams.logit_bias.push_back({tok, bias});
Expand Down Expand Up @@ -1130,7 +1124,7 @@ struct llama_server_context
slot.has_next_token = false;
}

if (result.tok == llama_token_eos(model))
if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
{
slot.stopped_eos = true;
slot.has_next_token = false;
Expand Down Expand Up @@ -1325,7 +1319,7 @@ struct llama_server_context
res.error = false;
res.stop = true;

const int n_embd = llama_n_embd(model);
const int n_embd = llama_model_n_embd(model);
if (!params.embedding)
{
LOG_WARNING("embedding disabled", {
Expand Down Expand Up @@ -1424,7 +1418,7 @@ struct llama_server_context
n_eval = n_batch;
}

const int n_embd = llama_n_embd(model);
const int n_embd = llama_model_n_embd(model);
float * embd = img.image_embedding + i * n_embd;
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
if (llama_decode(ctx, llava_batch.batch))
Expand Down Expand Up @@ -1705,11 +1699,11 @@ struct llama_server_context
suffix_tokens.erase(suffix_tokens.begin());
}

prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_vocab_fim_suf(vocab));
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
prefix_tokens.push_back(llama_token_middle(model));
prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
prompt_tokens = prefix_tokens;
}
else
Expand Down
Loading