tokenization: no double BOS tokens #7107

Open · wants to merge 1 commit into master
18 changes: 15 additions & 3 deletions common/common.cpp
@@ -2343,15 +2343,17 @@ std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special) {
return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
bool parse_special,
bool fix_double_bos) {
return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special, fix_double_bos);
}

std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,
bool parse_special) {
bool parse_special,
bool fix_double_bos) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
@@ -2363,9 +2365,19 @@ std::vector<llama_token> llama_tokenize(
} else {
result.resize(n_tokens);
}
if (fix_double_bos) {
llama_fix_double_bos(model, result);
}
return result;
}

void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt) {
const llama_token bos = llama_token_bos(model);
if (prompt.size() >= 2 && prompt[0] == bos && prompt[1] == bos) {
prompt.erase(prompt.begin(), prompt.begin() + 1);
}
}

std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
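For reference, the dedup added above is just a front-of-vector check. A minimal standalone sketch of the same logic, using plain `int` tokens and a hard-coded BOS id of 1 (both assumptions for illustration only, not part of this patch):

```cpp
#include <cassert>
#include <vector>

// Mirrors llama_fix_double_bos: if the first and the second token are both BOS, drop one.
static void fix_double_bos(std::vector<int> & prompt, int bos) {
    if (prompt.size() >= 2 && prompt[0] == bos && prompt[1] == bos) {
        prompt.erase(prompt.begin());
    }
}

int main() {
    std::vector<int> tokens = {1, 1, 15043, 3186}; // assumed BOS id = 1, arbitrary example ids after it
    fix_double_bos(tokens, /*bos=*/1);
    assert(tokens.size() == 3 && tokens[0] == 1);  // exactly one BOS remains
    return 0;
}
```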
9 changes: 7 additions & 2 deletions common/common.h
@@ -238,13 +238,18 @@ std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special = false);
bool parse_special = false,
bool fix_double_bos = false);

std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,
bool parse_special = false);
bool parse_special = false,
bool fix_double_bos = false);

// if the first and the second token in the prompt are both BOS, remove the first one
void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt);

// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
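The callers updated below pass fix_double_bos = true only where the prompt is tokenized with add_special = true (and so may already have a BOS prepended). A minimal caller sketch, assuming the common.h signature added in this PR (the helper itself is hypothetical):

```cpp
#include "common.h"

// Tokenize a full prompt whose chat template may already begin with the BOS piece.
// add_special = true can prepend a BOS, and parse_special = true can turn the
// template's literal BOS text into a second BOS token, so fix_double_bos = true
// collapses the duplicate.
static std::vector<llama_token> tokenize_prompt(const llama_context * ctx, const std::string & prompt) {
    return ::llama_tokenize(ctx, prompt, /*add_special=*/true, /*parse_special=*/true, /*fix_double_bos=*/true);
}
```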
2 changes: 1 addition & 1 deletion examples/batched/batched.cpp
@@ -71,7 +71,7 @@ int main(int argc, char ** argv) {
// tokenize the prompt

std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(model, params.prompt, true);
tokens_list = ::llama_tokenize(model, params.prompt, true, true, true);

const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;

2 changes: 1 addition & 1 deletion examples/beam-search/beam-search.cpp
@@ -137,7 +137,7 @@ int main(int argc, char ** argv)
// Tokenize the prompt :
//---------------------------------

std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true, true, true);

const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
2 changes: 1 addition & 1 deletion examples/embedding/embedding.cpp
@@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
for (const auto & prompt : prompts) {
auto inp = ::llama_tokenize(ctx, prompt, true, false);
auto inp = ::llama_tokenize(ctx, prompt, true, false, true);
if (inp.size() > n_batch) {
fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
2 changes: 1 addition & 1 deletion examples/imatrix/imatrix.cpp
@@ -378,7 +378,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);

auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
14 changes: 7 additions & 7 deletions examples/infill/infill.cpp
@@ -248,8 +248,8 @@ int main(int argc, char ** argv) {
suff_rm_leading_spc = false;
}
std::vector<llama_token> embd_inp;
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
const int space_token = 29871;
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
@@ -280,10 +280,10 @@ int main(int argc, char ** argv) {
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

original_prompt_len = original_inp.size();
@@ -630,8 +630,8 @@ int main(int argc, char ** argv) {
suff_rm_leading_spc = false;
}
// tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
@@ -703,7 +703,7 @@ int main(int argc, char ** argv) {

const size_t original_size = embd_inp.size();

const auto line_inp = ::llama_tokenize(ctx, buffer, false);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, true, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
8 changes: 4 additions & 4 deletions examples/llava/llava-cli.cpp
@@ -35,7 +35,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {

static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true, add_bos);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}
@@ -156,14 +156,14 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
@@ -173,7 +173,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:";
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
2 changes: 1 addition & 1 deletion examples/lookahead/lookahead.cpp
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp;
std::vector<llama_token> all;

inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
all = inp;

const int max_context_size = llama_n_ctx(ctx);
2 changes: 1 addition & 1 deletion examples/lookup/lookup-create.cpp
@@ -29,7 +29,7 @@ int main(int argc, char ** argv){

// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
fprintf(stderr, "%s: tokenization done\n", __func__);


2 changes: 1 addition & 1 deletion examples/lookup/lookup-stats.cpp
@@ -34,7 +34,7 @@ int main(int argc, char ** argv){

// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = ::llama_tokenize(ctx, params.prompt, true, true, true);

llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;
2 changes: 1 addition & 1 deletion examples/lookup/lookup.cpp
@@ -42,7 +42,7 @@ int main(int argc, char ** argv){

// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = ::llama_tokenize(ctx, params.prompt, true, true, true);

llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;
30 changes: 15 additions & 15 deletions examples/main/main.cpp
@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
if (params.chatml) {
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
}
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
} else {
LOG("use session tokens\n");
embd_inp = session_tokens;
@@ -277,10 +277,10 @@ int main(int argc, char ** argv) {
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

original_prompt_len = original_inp.size();
@@ -339,15 +339,15 @@ int main(int argc, char ** argv) {
}

// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true, false);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true, false);

LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());

// chatml prefix & suffix
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true, false);
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true, false);

LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
@@ -418,7 +418,7 @@ int main(int argc, char ** argv) {
for (const auto & antiprompt : params.antiprompt) {
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true, false);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
@@ -433,7 +433,7 @@ int main(int argc, char ** argv) {
if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
@@ -443,7 +443,7 @@ int main(int argc, char ** argv) {
if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
@@ -516,7 +516,7 @@ int main(int argc, char ** argv) {

antiprompt_ids.reserve(params.antiprompt.size());
for (const std::string & antiprompt : params.antiprompt) {
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true, false));
}

struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
@@ -801,7 +801,7 @@ int main(int argc, char ** argv) {
if (params.interactive) {
if (!params.antiprompt.empty()) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true, false);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
is_antiprompt = true;
}
@@ -875,9 +875,9 @@ int main(int argc, char ** argv) {
process_escapes(buffer);
}

const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);

LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

4 changes: 2 additions & 2 deletions examples/parallel/parallel.cpp
@@ -164,7 +164,7 @@ int main(int argc, char ** argv) {
}

std::vector<llama_token> tokens_system;
tokens_system = ::llama_tokenize(ctx, k_system, true);
tokens_system = ::llama_tokenize(ctx, k_system, true, true, true);
const int32_t n_tokens_system = tokens_system.size();

llama_seq_id g_seq_id = 0;
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {

// do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt;
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false, true, false);

for (size_t i = 0; i < tokens_prompt.size(); ++i) {
llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
4 changes: 2 additions & 2 deletions examples/passkey/passkey.cpp
@@ -108,10 +108,10 @@ int main(int argc, char ** argv) {

// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
tokens_list = ::llama_tokenize(ctx, params.prompt, true, true, true);

// tokenize the prefix and use it as a sink
const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true, true, true).size();

const int n_tokens_all = tokens_list.size();
