diff --git a/common/common.cpp b/common/common.cpp
index 467fb014eedb0b..1dc68b44eaa783 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2343,15 +2343,17 @@ std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool   add_special,
-        bool   parse_special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+        bool   parse_special,
+        bool   fix_double_bos) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special, fix_double_bos);
 }
 
 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool   add_special,
-        bool   parse_special) {
+        bool   parse_special,
+        bool   fix_double_bos) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
@@ -2363,9 +2365,19 @@ std::vector<llama_token> llama_tokenize(
     } else {
         result.resize(n_tokens);
     }
+    if (fix_double_bos) {
+        llama_fix_double_bos(model, result);
+    }
     return result;
 }
 
+void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt) {
+    const llama_token bos = llama_token_bos(model);
+    if (prompt.size() >= 2 && prompt[0] == bos && prompt[1] == bos) {
+        prompt.erase(prompt.begin(), prompt.begin() + 1);
+    }
+}
+
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
diff --git a/common/common.h b/common/common.h
index 9252a4b63889b3..03c562e74059d1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -238,13 +238,18 @@ std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool   add_special,
-        bool   parse_special = false);
+        bool   parse_special = false,
+        bool   fix_double_bos = false);
 
 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool   add_special,
-        bool   parse_special = false);
+        bool   parse_special = false,
+        bool   fix_double_bos = false);
+
+// if the first and the second token in the prompt are both BOS, remove the first token
+void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt);
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index be30d20bf81947..090576b7bea175 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -71,7 +71,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(model, params.prompt, true);
+    tokens_list = ::llama_tokenize(model, params.prompt, true, true, true);
 
     const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
index 3d34378a506eba..1b3acd24fe1f9c 100644
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -137,7 +137,7 @@ int main(int argc, char ** argv)
     // Tokenize the prompt :
     //---------------------------------
 
-    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true, true, true);
 
     const size_t max_context_size     = llama_n_ctx( ctx );
     const size_t max_tokens_list_size = max_context_size - 4 ;
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 6a93147d70e889..47d83f63ebaf86 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false, true);
         if (inp.size() > n_batch) {
             fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 29b5f3b3c12c83..571c78dd415a47 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -129,7 +129,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 static bool run(llama_context * ctx, const gpt_params & params) {
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos, false, true);
 
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 71e7a727f19432..6a423b0da5f9c5 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -378,7 +378,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index afac145f63934c..b35b88c7983bd1 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -248,8 +248,8 @@ int main(int argc, char ** argv) {
         suff_rm_leading_spc = false;
     }
     std::vector<llama_token> embd_inp;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
     const int space_token = 29871;
     if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
         inp_sfx.erase(inp_sfx.begin());
@@ -280,10 +280,10 @@ int main(int argc, char ** argv) {
 
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
 
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
 
         original_prompt_len = original_inp.size();
@@ -630,8 +630,8 @@ int main(int argc, char ** argv) {
             suff_rm_leading_spc = false;
         }
         // tokenize new prefix and suffix
-        std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-        std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+        std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+        std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
         if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
             inp_sfx.erase(inp_sfx.begin());
         }
@@ -703,7 +703,7 @@ int main(int argc, char ** argv) {
 
             const size_t original_size = embd_inp.size();
 
-            const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, true, false);
             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
 
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 157a680b5ecdb0..953798c3b8819d 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -35,7 +35,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
 
 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string str2 = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true, add_bos);
     eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
     return true;
 }
@@ -156,14 +156,14 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
         LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
         LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
@@ -173,7 +173,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
         user_prompt = prompt + "\nASSISTANT:";
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 9c3540b2008c20..ea9a9c680065bd 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
 
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     all = inp;
 
     const int max_context_size = llama_n_ctx(ctx);
diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp
index 1c230c9667c715..a99cc1e093f956 100644
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -29,7 +29,7 @@ int main(int argc, char ** argv){
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);
 
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index 87ecc0a4f1394e..b2bd9638ce81b4 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -34,7 +34,7 @@ int main(int argc, char ** argv){
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index eebbd00a58e66c..d85438636f1d66 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -42,7 +42,7 @@ int main(int argc, char ** argv){
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index eabbc2db382861..c819c023072be1 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
             if (params.chatml) {
                 params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
             }
-            embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+            embd_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         } else {
             LOG("use session tokens\n");
             embd_inp = session_tokens;
@@ -277,10 +277,10 @@ int main(int argc, char ** argv) {
 
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
 
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
 
         original_prompt_len = original_inp.size();
@@ -339,15 +339,15 @@ int main(int argc, char ** argv) {
     }
 
     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true, false);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true, false);
 
     LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
     LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
 
     // chatml prefix & suffix
-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
-    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
+    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true, false);
+    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true, false);
 
     LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
     LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
@@ -418,7 +418,7 @@ int main(int argc, char ** argv) {
             for (const auto & antiprompt : params.antiprompt) {
                 LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
                 if (params.verbose_prompt) {
-                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true, false);
                     for (int i = 0; i < (int) tmp.size(); i++) {
                         LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                     }
@@ -433,7 +433,7 @@ int main(int argc, char ** argv) {
         if (!params.input_prefix.empty()) {
             LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
                     LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                 }
@@ -443,7 +443,7 @@ int main(int argc, char ** argv) {
         if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
                 for (int i = 0; i < (int) tmp.size(); i++) {
                     LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                 }
@@ -516,7 +516,7 @@ int main(int argc, char ** argv) {
     antiprompt_ids.reserve(params.antiprompt.size());
     for (const std::string & antiprompt : params.antiprompt) {
-        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
+        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true, false));
     }
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
@@ -801,7 +801,7 @@ int main(int argc, char ** argv) {
         if (params.interactive) {
             if (!params.antiprompt.empty()) {
                 // tokenize and inject first reverse prompt
-                const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
+                const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true, false);
                 embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                 is_antiprompt = true;
             }
@@ -875,9 +875,9 @@ int main(int argc, char ** argv) {
                 process_escapes(buffer);
             }
 
-            const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-            const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
-            const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+            const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, false, false);
+            const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
 
             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 7c5595d6edb2dc..7cdad59b92b189 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -164,7 +164,7 @@ int main(int argc, char ** argv) {
     }
 
     std::vector<llama_token> tokens_system;
-    tokens_system = ::llama_tokenize(ctx, k_system, true);
+    tokens_system = ::llama_tokenize(ctx, k_system, true, true, true);
     const int32_t n_tokens_system = tokens_system.size();
 
     llama_seq_id g_seq_id = 0;
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
 
             // do not prepend BOS because we have a system prompt!
             std::vector<llama_token> tokens_prompt;
-            tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
+            tokens_prompt = ::llama_tokenize(ctx, client.prompt, false, true, false);
 
             for (size_t i = 0; i < tokens_prompt.size(); ++i) {
                 llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index f2ef9ca10d4a27..c0abec73cead80 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -108,10 +108,10 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     // tokenize the prefix and use it as a sink
-    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
+    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true, true, true).size();
 
     const int n_tokens_all = tokens_list.size();
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index db6e0949d4c47a..8c4801681c2ad4 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -345,7 +345,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     const int n_ctx = llama_n_ctx(ctx);
 
@@ -498,7 +498,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -843,7 +843,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j = 0; j < 4; j++) {
             hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
+            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true, true, true);
         }
 
         // determine the common prefix of the endings
@@ -1136,8 +1136,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
 
     for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
+        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true, true, true);
+        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true, true, true);
 
         task.common_prefix = 0;
         for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1152,8 +1152,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             task.seq_tokens[0].size() - task.common_prefix +
             task.seq_tokens[1].size() - task.common_prefix;
 
-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
+        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true, true, true).size();
+        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true, true, true).size();
     }
 
     fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
@@ -1359,7 +1359,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
             }
             return false;
         }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
+        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true, true, true));
     }
     auto min_len = task.seq_tokens.front().size();
     for (auto& seq : task.seq_tokens) {
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index c3b766882dbec7..e61d8b66f39353 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
     }
 
     // tokenize prompt
-    auto tokens = llama_tokenize(ctx, params.prompt, true);
+    auto tokens = llama_tokenize(ctx, params.prompt, true, true, true);
 
     // evaluate prompt
     llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ff0814b2f28bfb..02b6f616c7f6ec 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -765,6 +765,9 @@ struct server_context {
         // but it's better compared to completely ignoring ChatML and other chat templates
         const bool TMP_FORCE_SPECIAL = true;
 
+        // If special tokens are added, also make sure that this doesn't cause 2 BOS tokens if the user also adds one:
+        const bool fix_double_bos = add_special;
+
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -777,7 +780,7 @@ struct server_context {
 
                     std::vector<llama_token> p;
                     if (first) {
-                        p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL, fix_double_bos);
                         first = false;
                     } else {
                         p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
@@ -794,7 +797,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL, fix_double_bos);
         }
 
         return prompt_tokens;
@@ -1058,7 +1061,7 @@ struct server_context {
         system_tokens.clear();
 
         if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, true, false, true);
 
             llama_batch_clear(batch);
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index b0f8e0fdc49873..ce394ff2719ebe 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     const int n_ctx = llama_n_ctx(ctx);
     const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 12e46fbc91a242..413e83171c6c5b 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -128,7 +128,7 @@ int main(int argc, char ** argv) {
 
     // Tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true, true);
 
     const int max_context_size = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index 8b1baea800cc86..5b0a63dc6e87c5 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -28,7 +28,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> tokens;
 
-    tokens = ::llama_tokenize(model, prompt, true, true);
+    tokens = ::llama_tokenize(model, prompt, true, true, true);
 
     for (int i = 0; i < (int) tokens.size(); i++) {
         if (printing_ids) {
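
Reviewer note (not part of the patch): below is a minimal standalone sketch of the behaviour the new fix_double_bos path is intended to give. The fix_double_bos() helper mirrors llama_fix_double_bos() from this diff, but the hard-coded BOS id and the example token values are illustrative assumptions; the real code obtains the id via llama_token_bos(model) and operates on std::vector<llama_token>.

// sketch_double_bos.cpp -- illustration only, not part of the patch
#include <cassert>
#include <cstdio>
#include <vector>

using llama_token_t = int; // stand-in for llama_token

// Mirrors the patched helper: if the first two tokens are both BOS, drop one of them.
static void fix_double_bos(llama_token_t bos, std::vector<llama_token_t> & prompt) {
    if (prompt.size() >= 2 && prompt[0] == bos && prompt[1] == bos) {
        prompt.erase(prompt.begin());
    }
}

int main() {
    const llama_token_t bos = 1; // hypothetical BOS id; real code uses llama_token_bos(model)

    // e.g. the user wrote an explicit BOS in the prompt text and the tokenizer prepended another one
    std::vector<llama_token_t> doubled = {bos, bos, 15043, 2787}; // illustrative token ids
    std::vector<llama_token_t> normal  = {bos, 15043, 2787};

    fix_double_bos(bos, doubled);
    fix_double_bos(bos, normal);

    assert(doubled.size() == 3 && doubled[0] == bos); // duplicate BOS removed
    assert(normal.size()  == 3 && normal[0]  == bos); // untouched

    std::printf("tokens after fix: %zu and %zu\n", doubled.size(), normal.size());
    return 0;
}

In other words, call sites that pass fix_double_bos=true together with add_special=true end up with exactly one BOS even when the prompt text already starts with an explicit BOS token.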