From b592c70deb98c7be52b9b3fd2c378a7e2e68fab3 Mon Sep 17 00:00:00 2001
From: staviq
Date: Sun, 8 Oct 2023 02:43:23 +0200
Subject: [PATCH 1/9] Rewrite special token handling from #1931

---
 common/common.cpp      |  12 ++-
 common/common.h        |   6 +-
 common/train.cpp       |   8 +-
 examples/main/main.cpp |  14 +--
 llama.cpp              | 231 ++++++++++++++++++++++++++++++++++++++---
 llama.h                |   3 +-
 6 files changed, 243 insertions(+), 31 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 60b00b5fbb8f1..e0dfad3107899 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -862,21 +862,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 std::vector<llama_token> llama_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
-    bool add_bos) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos);
+    bool add_bos,
+    bool allow_special_tokens) {
+    return llama_tokenize(llama_get_model(ctx), text, add_bos, allow_special_tokens);
 }
 
 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
     const std::string & text,
-    bool add_bos) {
+    bool add_bos,
+    bool allow_special_tokens) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, allow_special_tokens);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, allow_special_tokens);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
diff --git a/common/common.h b/common/common.h
index c802152791797..42b7bd29bc10d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -151,12 +151,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 std::vector<llama_token> llama_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
-    bool add_bos);
+    bool add_bos,
+    bool allow_special_tokens = false);
 
 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
     const std::string & text,
-    bool add_bos);
+    bool add_bos,
+    bool allow_special_tokens = false);
 
 // tokenizes a token into a piece
 // should work similar to Python's `tokenizer.id_to_piece`
diff --git a/common/train.cpp b/common/train.cpp
index 35a4cf9e6cae3..37b1caae13f71 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -863,7 +863,7 @@ size_t tokenize_file(
             (int) buf.size(),
             out_tokens.data(),
             (int) out_tokens.size(),
-            false);
+            false,false);
         if (n_tokens < 0) {
             out_tokens.resize(-n_tokens);
             n_tokens = llama_tokenize(
@@ -872,7 +872,7 @@ size_t tokenize_file(
                 (int) buf.size(),
                 out_tokens.data(),
                 (int) out_tokens.size(),
-                false);
+                false,false);
         }
         if (n_tokens >= 0) {
             out_tokens.resize(n_tokens);
@@ -966,7 +966,7 @@ size_t tokenize_file(
                 (int) buf_sample.size(),
                 tok_sample.data(),
                 (int) tok_sample.size(),
-                false);
+                false,false);
             if (n_tokens < 0) {
                 tok_sample.resize(-n_tokens);
                 n_tokens = llama_tokenize(llama_get_model(lctx),
@@ -974,7 +974,7 @@ size_t tokenize_file(
                     (int) buf_sample.size(),
                     tok_sample.data(),
                     (int) tok_sample.size(),
-                    false);
+                    false,false);
                 GGML_ASSERT(n_tokens >= 0);
             }
             GGML_ASSERT(n_tokens <= (int) tok_sample.size());
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 775a5a201e5b8..aa2d817bf5f05 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -237,7 +237,7 @@ int main(int argc, char ** argv) {
         if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
             LOG("tokenize the prompt\n");
-            embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+            embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
         } else {
             LOG("use session tokens\n");
             embd_inp = session_tokens;
@@ -259,10 +259,10 @@ int main(int argc, char ** argv) {
 
         if (ctx_guidance) {
             LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
-            guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
+            guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos, true);
             LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
 
-            std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+            std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
             LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
 
             original_prompt_len = original_inp.size();
@@ -316,8 +316,8 @@ int main(int argc, char ** argv) {
     }
 
     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
 
     LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
     LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
@@ -715,7 +715,7 @@ int main(int argc, char ** argv) {
         if (params.interactive) {
             if (!params.antiprompt.empty()) {
                 // tokenize and inject first reverse prompt
-                const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
                 embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                 is_antiprompt = true;
             }
@@ -780,7 +780,7 @@ int main(int argc, char ** argv) {
                 embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
             }
 
-            const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, true);
             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
 
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
diff --git a/llama.cpp b/llama.cpp
index d10656bb801db..499d2de25c1bf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -75,6 +75,7 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#include <forward_list>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -1154,6 +1155,8 @@ struct llama_vocab {
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
 
+    std::unordered_map<token, id> special_tokens_cache;
+
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
     // default LLaMA special tokens
@@ -2063,7 +2066,7 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool allow_special_tokens = false);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
@@ -2179,6 +2182,74 @@ static void llm_load_vocab(
     GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
     GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
     GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
+
+    // build special tokens cache
+    {
+        // TODO: It is unclear (to me) at this point, whether special tokens are guaranteed to be of a deterministic type,
+        //  and will always be correctly labeled in 'added_tokens.json' etc.
+        // The assumption is, since special tokens aren't meant to be exposed to the end user, they are designed
+        //  to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
+        //  are special tokens.
+        // From testing, this appears to correlate 1:1 with special tokens.
+        //
+        for (const auto & t: vocab.token_to_id)
+        {
+            const auto & token = t.first;
+            const auto & id    = t.second;
+
+            if( token.length() > 1 )
+            {
+                bool is_tokenizable = false;
+
+                for (unsigned i = 1; i < token.length();)
+                {
+                    const auto left  = token.substr(0, i);
+                    const auto right = token.substr(i);
+
+                    // check if we didn't partition in the middle of a utf sequence
+                    auto utf = utf8_len( left.at( left.length() -1 ) );
+
+                    if( utf == 1 )
+                    {
+                        //fprintf(stderr, "BSTC . '%s' '%s' '%s'\n", token.c_str(), left.c_str(), right.c_str());
+
+                        if (vocab.token_to_id.find( left ) != vocab.token_to_id.end() &&
+                            vocab.token_to_id.find( right ) != vocab.token_to_id.end() )
+                        {
+                            is_tokenizable = true;
+                            break;
+                        }
+
+                        i++;
+                    }
+                    else
+                    {
+                        // fprintf(stderr, "BSTC SKIP '%s' '%s' '%s'\n", token.c_str(), left.c_str(), right.c_str());
+                        // skip over the rest of multibyte utf sequence
+                        i += utf - 1;
+                    }
+                }
+
+                if (!is_tokenizable)
+                {
+                    // it's faster to re-filter them here, since there is way less candidates now
+                    size_t utf8_str_len = 0;
+                    for (unsigned i = 0; i < token.length();)
+                    {
+                        utf8_str_len++;
+                        i += utf8_len( token.at(i) );
+                    }
+
+                    if (utf8_str_len > 1)
+                    {
+                        //fprintf(stderr, "BSTC SPECIAL '%s' '%d' ('%ld')\n", token.c_str(), id, utf8_str_len);
+
+                        vocab.special_tokens_cache[token] = id;
+                    }
+                }
+            }
+        }
+    }
 }
 
 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
@@ -5686,7 +5757,115 @@
     llm_bigram_bpe::queue work_queue;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
+typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
+    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
+    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
+} FRAGMENT_BUFFER_VARIANT_TYPE;
+
+struct fragment_buffer_variant{
+    fragment_buffer_variant(llama_vocab::id token)
+        :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
+        token(token){}
+    fragment_buffer_variant(std::string raw_text)
+        :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
+        raw_text(raw_text){}
+
+    FRAGMENT_BUFFER_VARIANT_TYPE type;
+    llama_vocab::id token;
+    std::string raw_text;
+};
+
+void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
+{
+    // for each special token
+    for( const auto & st: vocab.special_tokens_cache )
+    {
+        const auto & special_token = st.first;
+        const auto & special_id    = st.second;
+
+        // for each text fragment
+        //for (auto & fragment: buffer)
+        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+        while (it != buffer.end())
+        {
+            auto & fragment = (*it);
+            // if a fragment is text ( not yet processed )
+            if( fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT )
+            {
+                auto * raw_text = &(fragment.raw_text);
+
+                // loop over the text
+                while(true)
+                {
+                    // find the first occurrence of a given special token in this fragment
+                    auto match = raw_text->find( special_token );
+
+                    // no occurrences found, stop processing this fragment for a given special token
+                    if (match == std::string::npos)
+                    {
+                        break;
+                    }
+
+                    auto source = std::distance( buffer.begin(), it );
+
+                    if( match > 0 )
+                    {
+                        // left
+                        buffer.emplace_after(it, raw_text->substr(0, match));
+                        it++;
+                    }
+
+                    // special token
+                    buffer.emplace_after(it, special_id);
+                    it++;
+
+
+                    // right
+                    if (match + special_token.length() < raw_text->length())
+                    {
+                        buffer.emplace_after(it, raw_text->substr(match + special_token.length()));
+                        it++;
+
+                        if (source == 0)
+                        {
+                            buffer.erase_after(buffer.before_begin());
+                        }
+                        else
+                        {
+                            auto prev = std::prev( buffer.begin(), -(source-1) );
+                            buffer.erase_after(prev);
+                        }
+                        //it = std::prev( it, 1 );
+
+                        // repeat for the right side
+                        raw_text = &((*it).raw_text);
+                    }
+                    else
+                    {
+                        if (source == 0)
+                        {
+                            buffer.erase_after(buffer.before_begin());
+                        }
+                        else
+                        {
+                            auto prev = std::prev( buffer.begin(), -(source) );
+                            buffer.erase_after(prev);
+                        }
+                        //it = std::prev( it, 1 );
+
+                        break;
+                    }
+                }
+            }
+
+            it++;
+        }
+    }
+}
+
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool allow_special_tokens) {
     std::vector<llama_vocab::id> output;
 
     // OG tokenizer behavior:
@@ -5702,20 +5881,48 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
         return output;
     }
 
+    std::forward_list<fragment_buffer_variant> fragment_buffer;
+
+    fragment_buffer.emplace_front( raw_text );
+
+    if (allow_special_tokens) {
+        tokenizer_st_partition( vocab, fragment_buffer );
+    }
+
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
-                // without adding this leading whitespace, we do not get the same results as the original tokenizer
-                raw_text = " " + raw_text;
+                for (const auto & fragment: fragment_buffer)
+                {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
+                    {
+                        // without adding this leading whitespace, we do not get the same results as the original tokenizer
+                        auto raw_text = " " + fragment.raw_text;
 
-                llm_tokenizer_spm tokenizer(vocab);
-                llama_escape_whitespace(raw_text);
-                tokenizer.tokenize(raw_text, output);
+                        llm_tokenizer_spm tokenizer(vocab);
+                        llama_escape_whitespace(raw_text);
+                        tokenizer.tokenize(raw_text, output);
+                    }
+                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                    {
+                        output.push_back(fragment.token);
+                    }
+                }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe tokenizer(vocab);
-                tokenizer.tokenize(raw_text, output);
+                for (const auto & fragment: fragment_buffer)
+                {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
+                    {
+                        llm_tokenizer_bpe tokenizer(vocab);
+                        tokenizer.tokenize(raw_text, output);
+                    }
+                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                    {
+                        output.push_back(fragment.token);
+                    }
+                }
             } break;
     }
 
@@ -8629,15 +8836,15 @@ llama_token llama_token_eot(const struct llama_context * ctx) {
     return ctx->model.vocab.special_eot_id;
 }
 
-
 int llama_tokenize(
     const struct llama_model * model,
     const char * text,
     int text_len,
     llama_token * tokens,
     int n_max_tokens,
-    bool add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
+    bool add_bos,
+    bool allow_special_tokens) {
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, allow_special_tokens);
 
     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
diff --git a/llama.h b/llama.h
index a78015adab30c..ef1a33da5327b 100644
--- a/llama.h
+++ b/llama.h
@@ -521,7 +521,8 @@ extern "C" {
     int text_len,
     llama_token * tokens,
     int n_max_tokens,
-    bool add_bos);
+    bool add_bos,
+    bool allow_special_tokens);
 
 // Token Id -> Piece.
 // Uses the vocabulary in the provided context.

From fc634d87a8904b0844fbb71eb045d0f26d8bfd94 Mon Sep 17 00:00:00 2001
From: staviq
Date: Tue, 10 Oct 2023 16:34:24 +0200
Subject: [PATCH 2/9] shorten param name, add st verification by type

---
 common/common.cpp | 10 +++----
 common/common.h   |  4 +--
 llama.cpp         | 67 ++++++++++++++++++++++++++++++++++++++---------
 llama.h           | 11 ++++----
 4 files changed, 67 insertions(+), 25 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index e0dfad3107899..738659b69e9ab 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -863,22 +863,22 @@ std::vector<llama_token> llama_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
     bool add_bos,
-    bool allow_special_tokens) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos, allow_special_tokens);
+    bool special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
 }
 
 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
     const std::string & text,
     bool add_bos,
-    bool allow_special_tokens) {
+    bool special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, allow_special_tokens);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, allow_special_tokens);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
diff --git a/common/common.h b/common/common.h
index 42b7bd29bc10d..d178301066470 100644
--- a/common/common.h
+++ b/common/common.h
@@ -152,13 +152,13 @@ std::vector<llama_token> llama_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
     bool add_bos,
-    bool allow_special_tokens = false);
+    bool special = false);
 
 std::vector<llama_token> llama_tokenize(
     const struct llama_model * model,
     const std::string & text,
     bool add_bos,
-    bool allow_special_tokens = false);
+    bool special = false);
 
 // tokenizes a token into a piece
 // should work similar to Python's `tokenizer.id_to_piece`
diff --git a/llama.cpp b/llama.cpp
index 499d2de25c1bf..d6888994e53a3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2066,7 +2066,7 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool allow_special_tokens = false);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
@@ -2192,15 +2192,30 @@ static void llm_load_vocab(
         //  are special tokens.
         // From testing, this appears to correlate 1:1 with special tokens.
         //
+
+        // Counting special tokens and verifying in only one direction
+        //  is sufficient to detect difference in those two sets.
+        //
+        uint32_t special_tokens_count_by_type = 0;
+        uint32_t special_tokens_count_from_verification = 0;
+        bool special_tokens_definition_mismatch = false;
+
         for (const auto & t: vocab.token_to_id)
         {
             const auto & token = t.first;
             const auto & id = t.second;
 
+            // Count all non-normal tokens in the vocab while iterating
+            if( vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL )
+                special_tokens_count_by_type++;
+
+            // Skip single character tokens
             if( token.length() > 1 )
             {
                 bool is_tokenizable = false;
 
+                // Split token string representation in two, in all possible ways
+                //  and check if both halves can be matched to a valid token
                 for (unsigned i = 1; i < token.length();)
                 {
                     const auto left = token.substr(0, i);
@@ -2211,8 +2226,6 @@ static void llm_load_vocab(
 
                     if( utf == 1 )
                     {
-                        //fprintf(stderr, "BSTC . '%s' '%s' '%s'\n", token.c_str(), left.c_str(), right.c_str());
-
                         if (vocab.token_to_id.find( left ) != vocab.token_to_id.end() &&
                             vocab.token_to_id.find( right ) != vocab.token_to_id.end() )
                         {
@@ -2224,7 +2237,6 @@ static void llm_load_vocab(
                     }
                     else
                     {
-                        // fprintf(stderr, "BSTC SKIP '%s' '%s' '%s'\n", token.c_str(), left.c_str(), right.c_str());
                         // skip over the rest of multibyte utf sequence
                         i += utf - 1;
                     }
@@ -2232,7 +2244,10 @@ static void llm_load_vocab(
 
                 if (!is_tokenizable)
                 {
-                    // it's faster to re-filter them here, since there is way less candidates now
+                    // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
+                    //  it's faster to re-filter them here, since there are way less candidates now
+
+                    // Calculate a total "utf" length of a token string representation
                     size_t utf8_str_len = 0;
                     for (unsigned i = 0; i < token.length();)
                     {
@@ -2240,15 +2255,39 @@ static void llm_load_vocab(
                         i += utf8_len( token.at(i) );
                     }
 
+                    // And skip the ones which are one character
                     if (utf8_str_len > 1)
                     {
-                        //fprintf(stderr, "BSTC SPECIAL '%s' '%d' ('%ld')\n", token.c_str(), id, utf8_str_len);
+                        // At this point what we have left are special tokens only
 
                         vocab.special_tokens_cache[token] = id;
+
+                        // Count manually found special tokens
+                        special_tokens_count_from_verification ++;
+
+                        // If this manually found special token is not marked as such, flag a mismatch
+                        if( vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL )
+                            special_tokens_definition_mismatch = true;
                     }
                 }
             }
         }
+
+        if( special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type )
+        {
+            fprintf(stderr, "%s: WARNING: Mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
+                __func__,
+                special_tokens_count_from_verification, vocab.id_to_token.size(),
+                special_tokens_count_by_type, vocab.id_to_token.size()
+            );
+        }
+        else
+        {
+            fprintf(stderr, "%s: Special tokens definition check successful ( %u/%zu ).\n",
+                __func__,
+                special_tokens_count_from_verification, vocab.id_to_token.size()
+            );
+        }
     }
 }
@@ -5777,7 +5816,7 @@ struct fragment_buffer_variant{
     std::string raw_text;
 };
 
-void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
 {
     // for each special token
     for( const auto & st: vocab.special_tokens_cache )
@@ -5834,7 +5873,8 @@ void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragme
     }
 }
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool allow_special_tokens) {
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
     std::vector<llama_vocab::id> output;
 
     // OG tokenizer behavior:
@@ -5885,7 +5926,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 
     fragment_buffer.emplace_front( raw_text );
 
-    if (allow_special_tokens) {
+    if (special) {
         tokenizer_st_partition( vocab, fragment_buffer );
     }
 
@@ -8843,8 +8884,8 @@ int llama_tokenize(
     llama_token * tokens,
     int n_max_tokens,
     bool add_bos,
-    bool allow_special_tokens) {
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, allow_special_tokens);
+    bool special) {
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
 
     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
diff --git a/llama.h b/llama.h
index ef1a33da5327b..b88ce9e4218d0 100644
--- a/llama.h
+++ b/llama.h
@@ -511,10 +511,11 @@ extern "C" {
     // Tokenization
     //
 
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
+    /// @details Convert the provided text into tokens.
+    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
+    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     LLAMA_API int llama_tokenize(
         const struct llama_model * model,
         const char * text,
@@ -522,7 +523,7 @@ extern "C" {
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos,
-        bool allow_special_tokens);
+        bool special);
 
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
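[Editor's illustration - not part of the patch series. Before moving on to the offset rewrite below, this minimal sketch shows what the new flag changes for a caller. It reuses the two-pass resize pattern from common/common.cpp and the llama_tokenize() C signature introduced in PATCH 2; the model path and the "<|im_start|>" literal are assumptions - any GGUF model whose vocabulary defines special tokens will do.]

// sketch.cpp - compare plaintext vs. special tokenization of a chat marker
#include <cstdio>
#include <string>
#include <vector>
#include "llama.h"

static std::vector<llama_token> tokenize(const llama_model * model, const std::string & text, bool special) {
    // same two-pass pattern as common/common.cpp: a negative return value
    // from llama_tokenize() is the required buffer size
    std::vector<llama_token> tokens(text.length() + 1);
    int n = llama_tokenize(model, text.data(), (int) text.length(), tokens.data(), (int) tokens.size(), /*add_bos =*/ true, special);
    if (n < 0) {
        tokens.resize(-n);
        n = llama_tokenize(model, text.data(), (int) text.length(), tokens.data(), (int) tokens.size(), true, special);
    }
    tokens.resize(n);
    return tokens;
}

int main() {
    llama_backend_init(false);
    llama_model * model = llama_load_model_from_file("model.gguf", llama_model_default_params()); // hypothetical path

    const std::string prompt = "<|im_start|>user";
    // special == false: the marker is treated as plaintext and split into many tokens;
    // special == true:  tokenizer_st_partition() matches it against special_tokens_cache -> one id
    printf("plaintext: %zu tokens, special: %zu tokens\n",
           tokenize(model, prompt, false).size(),
           tokenize(model, prompt, true).size());

    llama_free_model(model);
    llama_backend_free();
    return 0;
}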
From 29e6b46e03168e76dce52247f659e40b6878ec4a Mon Sep 17 00:00:00 2001
From: staviq
Date: Wed, 11 Oct 2023 22:47:17 +0200
Subject: [PATCH 3/9] use offsets instead of copy by substr

---
 llama.cpp | 128 ++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 111 insertions(+), 17 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index d6888994e53a3..85c8bdedd2981 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5802,20 +5802,35 @@ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
 } FRAGMENT_BUFFER_VARIANT_TYPE;
 
 struct fragment_buffer_variant{
-    fragment_buffer_variant(llama_vocab::id token)
+    fragment_buffer_variant(llama_vocab::id _token)
         :
         type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
-        token(token){}
-    fragment_buffer_variant(std::string raw_text)
+        token(_token),
+        raw_text(_dummy),
+        offset(0),
+        length(0){}
+    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
         :
         type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
-        raw_text(raw_text){}
-
-    FRAGMENT_BUFFER_VARIANT_TYPE type;
-    llama_vocab::id token;
-    std::string raw_text;
+        token((llama_vocab::id)-1),
+        raw_text(_raw_text),
+        offset(_offset),
+        length(_length){
+            GGML_ASSERT( _offset >= 0 );
+            GGML_ASSERT( _length >= 1 );
+            GGML_ASSERT( offset + length <= raw_text.length() );
+        }
+
+    const FRAGMENT_BUFFER_VARIANT_TYPE type;
+    const llama_vocab::id token;
+    const std::string _dummy;
+    const std::string & raw_text;
+    const uint64_t offset;
+    const uint64_t length;
 };
 
+#define PRETOKENIZERDEBUG
+
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
 {
     // for each special token
@@ -5834,12 +5849,16 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
             if( fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT )
             {
                 auto * raw_text = &(fragment.raw_text);
+                auto raw_text_base_offset = fragment.offset;
+                auto raw_text_base_length = fragment.length;
 
                 // loop over the text
                 while(true)
                 {
                     // find the first occurrence of a given special token in this fragment
-                    auto match = raw_text->find( special_token );
+                    // passing offset argument only limits the "search area" but match coordinates
+                    //  are still relative to the source full raw_text
+                    auto match = raw_text->find( special_token, raw_text_base_offset );
 
                     // no occurrences found, stop processing this fragment for a given special token
                     if (match == std::string::npos)
                     {
                         break;
                     }
 
+                    // check if match is within bounds of offset <-> length
+                    if( match + special_token.length() > raw_text_base_offset + raw_text_base_length )
+                    {
+                        // match is out of bounds
+                        break;
+                    }
+
+#ifdef PRETOKENIZERDEBUG
+                    fprintf(stderr,"FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+
                     auto source = std::distance( buffer.begin(), it );
 
-                    if( match > 0 )
+                    // if match is further than base offset
+                    //  then we have some text to the left of it
+                    if( match > raw_text_base_offset )
                     {
                         // left
-                        buffer.emplace_after(it, raw_text->substr(0, match));
+                        //buffer.emplace_after(it, raw_text->substr(0, match));
+                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
+                        const int64_t left_reminder_length = match - raw_text_base_offset;
+                        buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr,"FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+#endif
+
                         it++;
                     }
 
                     // special token
                     buffer.emplace_after(it, special_id);
                     it++;
 
                     // right
-                    if (match + special_token.length() < raw_text->length())
+                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length)
                     {
+                        /*
+                         |                |
+                         -------------------------------------------------------------------------
+                         .       |ttttt|                |
+                        */
-                        buffer.emplace_after(it, raw_text->substr(match + special_token.length()));
+                        //buffer.emplace_after(it, raw_text->substr(match + special_token.length()));
+                        const int64_t right_reminder_offset = match + special_token.length();
+                        const int64_t right_reminder_length = raw_text_base_length - ( ( match - raw_text_base_offset ) + special_token.length() );
+                        buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr,"FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+#endif
+
                         it++;
 
                         if (source == 0)
                         {
+                            // TODO? It might not be needed to store/restore the iterator like this
+                            //  but this gives me the peace of mind I'm not causing some
+                            //  accidental undefined behaviour.
+                            auto it_backup = std::distance( buffer.begin(), it );
+
                             buffer.erase_after(buffer.before_begin());
+
+                            it = std::next( buffer.begin(), it_backup-1 );
                         }
                         else
                         {
+                            auto it_backup = std::distance( buffer.begin(), it );
+
                             //auto prev = std::prev( buffer.begin(), -(source-1) );
                             auto prev = std::next( buffer.begin(), (source-1) );
                             buffer.erase_after(prev);
+
+                            it = std::next( buffer.begin(), it_backup-1 );
                         }
                         //it = std::prev( it, 1 );
 
                         // repeat for the right side
-                        raw_text = &((*it).raw_text);
+                        raw_text_base_offset = right_reminder_offset; //match + special_token.length();
+                        raw_text_base_length = right_reminder_length; //right_reminder_length - ( ( match + special_token.length() ) - raw_text_base_offset );
+                        //raw_text = &((*it).raw_text);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr,"RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+
                     }
                     else
                     {
                         if (source == 0)
                         {
+                            auto it_backup = std::distance( buffer.begin(), it );
+
                             buffer.erase_after(buffer.before_begin());
+
+                            it = std::next( buffer.begin(), it_backup-1 );
                         }
                         else
                         {
+                            auto it_backup = std::distance( buffer.begin(), it );
+
                             //auto prev = std::prev( buffer.begin(), -(source) );
-                            auto prev = std::next( buffer.begin(), (source) );
+                            auto prev = std::next( buffer.begin(), (source-1) );
                             buffer.erase_after(prev);
+
+                            it = std::next( buffer.begin(), it_backup-1 );
                         }
                         //it = std::prev( it, 1 );
 
                         break;
@@ -5924,7 +6003,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 
     std::forward_list<fragment_buffer_variant> fragment_buffer;
 
-    fragment_buffer.emplace_front( raw_text );
+    fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
 
     if (special) {
         tokenizer_st_partition( vocab, fragment_buffer );
@@ -5938,7 +6017,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
                 {
                     // without adding this leading whitespace, we do not get the same results as the original tokenizer
-                    auto raw_text = " " + fragment.raw_text;
+
+                    // TODO: It's likely possible to get rid of this string copy entirely
+                    //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
+                    //  and passing 'add space prefix' as bool argument
+                    //
+                    auto raw_text = " " + fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                    fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
 
                     llm_tokenizer_spm tokenizer(vocab);
                     llama_escape_whitespace(raw_text);
                     tokenizer.tokenize(raw_text, output);
@@ -5956,6 +6044,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
             {
                 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
                 {
+                    auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                    fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+
                     llm_tokenizer_bpe tokenizer(vocab);
                     tokenizer.tokenize(raw_text, output);
                 }

From f7b1205a515ca24cd8af1ecec94b697f275400db Mon Sep 17 00:00:00 2001
From: staviq
Date: Thu, 12 Oct 2023 00:26:44 +0200
Subject: [PATCH 4/9] formatting, remove copying iterator on delete

---
 llama.cpp | 112 +++++++++++------------------------------------------
 1 file changed, 23 insertions(+), 89 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index ffe085ebe5bf3..59593f1e8a7d0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2268,11 +2268,11 @@ static void llm_load_vocab(
         const auto & id = t.second;
 
         // Count all non-normal tokens in the vocab while iterating
-        if( vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL )
+        if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL)
             special_tokens_count_by_type++;
 
         // Skip single character tokens
-        if( token.length() > 1 )
+        if (token.length() > 1)
         {
             bool is_tokenizable = false;
 
@@ -2284,9 +2284,9 @@ static void llm_load_vocab(
                 const auto right = token.substr(i);
 
                 // check if we didn't partition in the middle of a utf sequence
-                auto utf = utf8_len( left.at( left.length() -1 ) );
+                auto utf = utf8_len(left.at(left.length() - 1));
 
-                if( utf == 1 )
+                if (utf == 1)
                 {
                     if (vocab.token_to_id.find( left ) != vocab.token_to_id.end() &&
                         vocab.token_to_id.find( right ) != vocab.token_to_id.end() )
@@ -2294,7 +2294,6 @@ static void llm_load_vocab(
                         is_tokenizable = true;
                         break;
                     }
-
                     i++;
                 }
                 else
@@ -2314,21 +2313,20 @@ static void llm_load_vocab(
             for (unsigned i = 0; i < token.length();)
             {
                 utf8_str_len++;
-                i += utf8_len( token.at(i) );
+                i += utf8_len(token.at(i));
             }
 
             // And skip the ones which are one character
             if (utf8_str_len > 1)
             {
                 // At this point what we have left are special tokens only
-
                 vocab.special_tokens_cache[token] = id;
 
                 // Count manually found special tokens
-                special_tokens_count_from_verification ++;
+                special_tokens_count_from_verification++;
 
                 // If this manually found special token is not marked as such, flag a mismatch
-                if( vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL )
+                if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL)
                     special_tokens_definition_mismatch = true;
             }
         }
@@ -2337,7 +2335,7 @@ static void llm_load_vocab(
 
     if( special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type )
     {
-        fprintf(stderr, "%s: WARNING: Mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
+        fprintf(stderr, "warning: %s: Mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
             __func__,
             special_tokens_count_from_verification, vocab.id_to_token.size(),
             special_tokens_count_by_type, vocab.id_to_token.size()
@@ -6608,31 +6606,30 @@ struct fragment_buffer_variant{
     const uint64_t length;
 };
 
-#define PRETOKENIZERDEBUG
+// #define PRETOKENIZERDEBUG
 
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
 {
     // for each special token
-    for( const auto & st: vocab.special_tokens_cache )
+    for (const auto & st: vocab.special_tokens_cache)
     {
         const auto & special_token = st.first;
         const auto & special_id = st.second;
 
         // for each text fragment
-        //for (auto & fragment: buffer)
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
         while (it != buffer.end())
         {
             auto & fragment = (*it);
             // if a fragment is text ( not yet processed )
-            if( fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT )
+            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
             {
                 auto * raw_text = &(fragment.raw_text);
                 auto raw_text_base_offset = fragment.offset;
                 auto raw_text_base_length = fragment.length;
 
                 // loop over the text
-                while(true)
+                while (true)
                 {
                     // find the first occurrence of a given special token in this fragment
                     // passing offset argument only limits the "search area" but match coordinates
                     //  are still relative to the source full raw_text
                     auto match = raw_text->find( special_token, raw_text_base_offset );
 
                     // no occurrences found, stop processing this fragment for a given special token
-                    if (match == std::string::npos)
-                    {
-                        break;
-                    }
+                    if (match == std::string::npos) break;
 
                     // check if match is within bounds of offset <-> length
-                    if( match + special_token.length() > raw_text_base_offset + raw_text_base_length )
-                    {
-                        // match is out of bounds
-                        break;
-                    }
+                    if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
 
 #ifdef PRETOKENIZERDEBUG
                     fprintf(stderr,"FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
-
-                    auto source = std::distance( buffer.begin(), it );
+                    auto source = std::distance(buffer.begin(), it);
 
                     // if match is further than base offset
                     //  then we have some text to the left of it
-                    if( match > raw_text_base_offset )
+                    if (match > raw_text_base_offset)
                     {
                         // left
-                        //buffer.emplace_after(it, raw_text->substr(0, match));
                         const int64_t left_reminder_offset = raw_text_base_offset + 0;
                         const int64_t left_reminder_length = match - raw_text_base_offset;
                         buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
 
 #ifdef PRETOKENIZERDEBUG
                         fprintf(stderr,"FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
-
                         it++;
                     }
 
                     // special token
                     buffer.emplace_after(it, special_id);
                     it++;
-
                     // right
                     if (match + special_token.length() < raw_text_base_offset + raw_text_base_length)
                     {
-                        /*
-                         |                |
-                         -------------------------------------------------------------------------
-                         .       |ttttt|                |
-                        */
-                        //buffer.emplace_after(it, raw_text->substr(match + special_token.length()));
                         const int64_t right_reminder_offset = match + special_token.length();
-                        const int64_t right_reminder_length = raw_text_base_length - ( ( match - raw_text_base_offset ) + special_token.length() );
+                        const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
                         buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
 
 #ifdef PRETOKENIZERDEBUG
                         fprintf(stderr,"FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif
 
                         it++;
 
-                        if (source == 0)
-                        {
-                            // TODO?
-                            // It might not be needed to store/restore the iterator like this
-                            //  but this gives me the peace of mind I'm not causing some
-                            //  accidental undefined behaviour.
-                            auto it_backup = std::distance( buffer.begin(), it );
-
-                            buffer.erase_after(buffer.before_begin());
-
-                            it = std::next( buffer.begin(), it_backup-1 );
-                        }
-                        else
-                        {
-                            auto it_backup = std::distance( buffer.begin(), it );
-
-                            //auto prev = std::prev( buffer.begin(), -(source-1) );
-                            auto prev = std::next( buffer.begin(), (source-1) );
-                            buffer.erase_after(prev);
-
-                            it = std::next( buffer.begin(), it_backup-1 );
-                        }
-                        //it = std::prev( it, 1 );
+                        if (source == 0) buffer.erase_after(buffer.before_begin());
+                        else buffer.erase_after(std::next(buffer.begin(), (source-1)));
 
                         // repeat for the right side
-                        raw_text_base_offset = right_reminder_offset; //match + special_token.length();
-                        raw_text_base_length = right_reminder_length; //right_reminder_length - ( ( match + special_token.length() ) - raw_text_base_offset );
-                        //raw_text = &((*it).raw_text);
+                        raw_text_base_offset = right_reminder_offset;
+                        raw_text_base_length = right_reminder_length;
 
 #ifdef PRETOKENIZERDEBUG
                         fprintf(stderr,"RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
                     }
                     else
                     {
-                        if (source == 0)
-                        {
-                            auto it_backup = std::distance( buffer.begin(), it );
-
-                            buffer.erase_after(buffer.before_begin());
-
-                            it = std::next( buffer.begin(), it_backup-1 );
-                        }
-                        else
-                        {
-                            auto it_backup = std::distance( buffer.begin(), it );
-
-                            //auto prev = std::prev( buffer.begin(), -(source) );
-                            auto prev = std::next( buffer.begin(), (source-1) );
-                            buffer.erase_after(prev);
-
-                            it = std::next( buffer.begin(), it_backup-1 );
-                        }
-                        //it = std::prev( it, 1 );
-
+                        if (source == 0) buffer.erase_after(buffer.before_begin());
+                        else buffer.erase_after(std::next(buffer.begin(), (source-1)));
                         break;
                     }
                 }
             }
-
             it++;
         }
     }
 }
@@ -6781,12 +6720,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     }
 
     std::forward_list<fragment_buffer_variant> fragment_buffer;
-
     fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
 
-    if (special) {
-        tokenizer_st_partition( vocab, fragment_buffer );
-    }
+    if (special) tokenizer_st_partition( vocab, fragment_buffer );
 
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
@@ -6806,7 +6742,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                     fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-
                     llm_tokenizer_spm tokenizer(vocab);
                     llama_escape_whitespace(raw_text);
                     tokenizer.tokenize(raw_text, output);
@@ -6828,7 +6763,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                     fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-
                     llm_tokenizer_bpe tokenizer(vocab);
                     tokenizer.tokenize(raw_text, output);
                 }

From 5c6b2be11fc3b347ddd0249945948d9b07742e07 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 12 Oct 2023 14:47:29 +0300
Subject: [PATCH 5/9] llama : normalize code-style

---
 common/train.cpp       |   8 ++--
 examples/main/main.cpp |   2 +-
 llama.cpp              | 102 +++++++++++++++++++----------------------
 3 files changed, 51 insertions(+), 61 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index 37b1caae13f71..972eaefe00f05 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -863,7 +863,7 @@ size_t tokenize_file(
             (int) buf.size(),
             out_tokens.data(),
            (int) out_tokens.size(),
-            false,false);
+            false, false);
         if (n_tokens < 0) {
             out_tokens.resize(-n_tokens);
             n_tokens = llama_tokenize(
@@ -872,7 +872,7 @@ size_t tokenize_file(
                 (int) buf.size(),
                 out_tokens.data(),
                 (int) out_tokens.size(),
-                false,false);
+                false, false);
         }
         if (n_tokens >= 0) {
             out_tokens.resize(n_tokens);
@@ -966,7 +966,7 @@ size_t tokenize_file(
                 (int) buf_sample.size(),
                 tok_sample.data(),
                 (int) tok_sample.size(),
-                false,false);
+                false, false);
             if (n_tokens < 0) {
                 tok_sample.resize(-n_tokens);
                 n_tokens = llama_tokenize(llama_get_model(lctx),
@@ -974,7 +974,7 @@ size_t tokenize_file(
                     (int) buf_sample.size(),
                     tok_sample.data(),
                     (int) tok_sample.size(),
-                    false,false);
+                    false, false);
                 GGML_ASSERT(n_tokens >= 0);
             }
             GGML_ASSERT(n_tokens <= (int) tok_sample.size());
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 655c79d29a195..af326bba81b47 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -321,7 +321,7 @@ int main(int argc, char ** argv) {
 
     // prefix & suffix for instruct mode
     const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",   false, true);
 
     LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
     LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
diff --git a/llama.cpp b/llama.cpp
index 59593f1e8a7d0..1f82dcb6d2d25 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2260,89 +2260,79 @@ static void llm_load_vocab(
     //
     uint32_t special_tokens_count_by_type = 0;
     uint32_t special_tokens_count_from_verification = 0;
+
     bool special_tokens_definition_mismatch = false;
 
-    for (const auto & t: vocab.token_to_id)
-    {
+    for (const auto & t : vocab.token_to_id) {
         const auto & token = t.first;
-        const auto & id = t.second;
+        const auto & id    = t.second;
 
         // Count all non-normal tokens in the vocab while iterating
-        if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL)
+        if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
             special_tokens_count_by_type++;
+        }
 
         // Skip single character tokens
-        if (token.length() > 1)
-        {
+        if (token.length() > 1) {
             bool is_tokenizable = false;
 
             // Split token string representation in two, in all possible ways
             //  and check if both halves can be matched to a valid token
-            for (unsigned i = 1; i < token.length();)
-            {
-                const auto left = token.substr(0, i);
+            for (unsigned i = 1; i < token.length();) {
+                const auto left  = token.substr(0, i);
                 const auto right = token.substr(i);
 
                 // check if we didn't partition in the middle of a utf sequence
                 auto utf = utf8_len(left.at(left.length() - 1));
 
-                if (utf == 1)
-                {
-                    if (vocab.token_to_id.find( left ) != vocab.token_to_id.end() &&
-                        vocab.token_to_id.find( right ) != vocab.token_to_id.end() )
-                    {
+                if (utf == 1) {
+                    if (vocab.token_to_id.find(left)  != vocab.token_to_id.end() &&
+                        vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
                         is_tokenizable = true;
                         break;
                     }
                     i++;
-                }
-                else
-                {
+                } else {
                     // skip over the rest of multibyte utf sequence
                     i += utf - 1;
                 }
             }
 
-            if (!is_tokenizable)
-            {
+            if (!is_tokenizable) {
                 // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
                 //  it's faster to re-filter them here, since there are way less candidates now
 
                 // Calculate a total "utf" length of a token string representation
                 size_t utf8_str_len = 0;
-                for (unsigned i = 0; i < token.length();)
-                {
+                for (unsigned i = 0; i < token.length();) {
                     utf8_str_len++;
                     i += utf8_len(token.at(i));
                 }
 
                 // And skip the ones which are one character
-                if (utf8_str_len > 1)
-                {
+                if (utf8_str_len > 1) {
                     // At this point what we have left are special tokens only
                     vocab.special_tokens_cache[token] = id;
 
                     // Count manually found special tokens
-                    special_tokens_count_from_verification ++;
+                    special_tokens_count_from_verification++;
 
                     // If this manually found special token is not marked as such, flag a mismatch
-                    if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL)
+                    if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
                         special_tokens_definition_mismatch = true;
+                    }
                 }
             }
         }
     }
 
-    if( special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type )
-    {
-        fprintf(stderr, "warning: %s: Mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
+    if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
+        fprintf(stderr, "%s: warning: Mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
             __func__,
             special_tokens_count_from_verification, vocab.id_to_token.size(),
             special_tokens_count_by_type, vocab.id_to_token.size()
         );
-    }
-    else
-    {
+    } else {
         fprintf(stderr, "%s: Special tokens definition check successful ( %u/%zu ).\n",
             __func__,
             special_tokens_count_from_verification, vocab.id_to_token.size()
         );
@@ -6611,30 +6601,28 @@ struct fragment_buffer_variant{
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
-    for (const auto & st: vocab.special_tokens_cache)
-    {
+    for (const auto & st: vocab.special_tokens_cache) {
         const auto & special_token = st.first;
-        const auto & special_id = st.second;
+        const auto & special_id    = st.second;
 
         // for each text fragment
         std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
-        while (it != buffer.end())
-        {
+        while (it != buffer.end()) {
             auto & fragment = (*it);
+
             // if a fragment is text ( not yet processed )
-            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
-            {
+            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                 auto * raw_text = &(fragment.raw_text);
+
                 auto raw_text_base_offset = fragment.offset;
                 auto raw_text_base_length = fragment.length;
 
                 // loop over the text
-                while (true)
-                {
+                while (true) {
                     // find the first occurrence of a given special token in this fragment
                     // passing offset argument only limits the "search area" but match coordinates
                     //  are still relative to the source full raw_text
-                    auto match = raw_text->find( special_token, raw_text_base_offset );
+                    auto match = raw_text->find(special_token, raw_text_base_offset);
 
                     // no occurrences found, stop processing this fragment for a given special token
                     if (match == std::string::npos) break;
 
                     // check if match is within bounds of offset <-> length
                     if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
 
 #ifdef PRETOKENIZERDEBUG
-                    fprintf(stderr,"FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+                    fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
                     auto source = std::distance(buffer.begin(), it);
 
                     // if match is further than base offset
                     //  then we have some text to the left of it
-                    if (match > raw_text_base_offset)
-                    {
+                    if (match > raw_text_base_offset) {
                         // left
                         const int64_t left_reminder_offset = raw_text_base_offset + 0;
                         const int64_t left_reminder_length = match - raw_text_base_offset;
                         buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
 
 #ifdef PRETOKENIZERDEBUG
-                        fprintf(stderr,"FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+                        fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
 #endif
                         it++;
                     }
 
                     // special token
                     buffer.emplace_after(it, special_id);
                     it++;
 
                     // right
-                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length)
-                    {
+                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
                         const int64_t right_reminder_offset = match + special_token.length();
                         const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
                         buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
 
 #ifdef PRETOKENIZERDEBUG
-                        fprintf(stderr,"FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+                        fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
 #endif
 
                         it++;
 
-                        if (source == 0) buffer.erase_after(buffer.before_begin());
-                        else buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        }
 
                         // repeat for the right side
                         raw_text_base_offset = right_reminder_offset;
                         raw_text_base_length = right_reminder_length;
 
 #ifdef PRETOKENIZERDEBUG
-                        fprintf(stderr,"RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+                        fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
-                    }
-                    else
-                    {
-                        if (source == 0) buffer.erase_after(buffer.before_begin());
-                        else buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                    } else {
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        }
                         break;
                     }
                 }

From 0f1c569540be0618d0a34c1e2022d81d4f9b1e6f Mon Sep 17 00:00:00 2001
From: staviq
Date: Thu, 12 Oct 2023 16:17:09 +0200
Subject: [PATCH 6/9] swift fix

---
 examples/batched.swift/Sources/main.swift | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index 938f30512ca6a..05d1bb9d00068 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -209,7 +209,7 @@ llama_print_timings(context)
 
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let n_tokens = text.count + (add_bos ? 1 : 0)
     let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
+    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
     var swiftTokens: [llama_token] = []
     for i in 0 ..< tokenCount {
         swiftTokens.append(tokens[Int(i)])

From 5974d617c09e83560c37715fa54cb5176ff4edec Mon Sep 17 00:00:00 2001
From: staviq
Date: Thu, 12 Oct 2023 17:49:15 +0200
Subject: [PATCH 7/9] print pfx/sfx if verb, main: split pfx input sfx

---
 examples/main/main.cpp | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index af326bba81b47..a5fb65548ff4f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -383,6 +383,12 @@ int main(int argc, char ** argv) {
         if (!params.antiprompt.empty()) {
             for (const auto & antiprompt : params.antiprompt) {
                 LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
+                if (params.verbose_prompt) {
+                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                    for (int i = 0; i < (int) tmp.size(); i++) {
+                        LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    }
+                }
             }
         }
 
@@ -392,10 +398,22 @@ int main(int argc, char ** argv) {
 
         if (!params.input_prefix.empty()) {
             LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+            if (params.verbose_prompt) {
+                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+                for (int i = 0; i < (int) tmp.size(); i++) {
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                }
+            }
         }
 
         if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+            if (params.verbose_prompt) {
+                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                for (int i = 0; i < (int) tmp.size(); i++) {
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                }
+            }
         }
     }
     LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
@@ -744,8 +762,7 @@ int main(int argc, char ** argv) {
             std::string buffer;
             if (!params.input_prefix.empty()) {
                 LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
-                buffer += params.input_prefix;
-                printf("%s", buffer.c_str());
+                printf("%s", params.input_prefix.c_str());
             }
 
             // color user input only
@@ -767,7 +784,6 @@ int main(int argc, char ** argv) {
             // append input suffix if any
             if (!params.input_suffix.empty()) {
                 LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
-                buffer += params.input_suffix;
                 printf("%s", params.input_suffix.c_str());
             }
 
@@ -782,10 +798,14 @@ int main(int argc, char ** argv) {
                     embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                 }
 
-                const auto line_inp = ::llama_tokenize(ctx, buffer, false, true);
+                const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
+                const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
+                const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
                 LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
 
+                embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+                embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
 
                 // instruct mode: insert response suffix
                 if (params.instruct) {

From 1c28116de410ad689fdf26d637be6f0530107cb0 Mon Sep 17 00:00:00 2001
From: staviq
Date: Fri, 13 Oct 2023 01:14:23 +0200
Subject: [PATCH 8/9] don't add space when using special tokens

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 1f82dcb6d2d25..b571d4eb286d7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6727,7 +6727,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
                     //  and passing 'add space prefix' as bool argument
                     //
-                    auto raw_text = " " + fragment.raw_text.substr(fragment.offset, fragment.length);
+                    auto raw_text = (special?"":" ") + fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
                     fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());

From fc82541b1de0f625a0d3444563108c5b7e8bfd6b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 17 Oct 2023 18:08:57 +0300
Subject: [PATCH 9/9] minor : comment + spacing

---
 llama.cpp | 2 +-
 llama.h   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index b571d4eb286d7..e894391f95f16 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6727,7 +6727,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
                     //  and passing 'add space prefix' as bool argument
                     //
-                    auto raw_text = (special?"":" ") + fragment.raw_text.substr(fragment.offset, fragment.length);
+                    auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
 
 #ifdef PRETOKENIZERDEBUG
                     fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
diff --git a/llama.h b/llama.h
index b88ce9e4218d0..b13f231233907 100644
--- a/llama.h
+++ b/llama.h
@@ -516,6 +516,7 @@ extern "C" {
     /// @return Returns the number of tokens on success, no more than n_max_tokens
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
+    ///                Does not insert a leading space.
     LLAMA_API int llama_tokenize(
         const struct llama_model * model,
         const char * text,
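[Editor's illustration - not part of the patch series. Because tokenizer_st_partition() has to splice a std::forward_list in place, its control flow is dense; the sketch below restates the same partitioning idea with owned substrings and a std::vector. The names fragment and partition_on_special are illustrative only, and the substr copies are exactly what PATCH 3 ("use offsets instead of copy by substr") removes from the real code by storing (offset, length) views into the source string instead.]

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// A fragment is either a span of raw text (still to be tokenized by SPM/BPE)
// or an already-resolved special token id.
struct fragment {
    bool        is_token;
    int32_t     token;   // valid when is_token
    std::string text;    // valid when !is_token
};

static std::vector<fragment> partition_on_special(
        const std::string & raw,
        const std::unordered_map<std::string, int32_t> & special_tokens_cache) {
    std::vector<fragment> out;
    out.push_back({false, -1, raw});

    // like the patch: take one special token at a time and re-scan every
    // raw-text fragment produced so far
    for (const auto & st : special_tokens_cache) {
        std::vector<fragment> next;
        for (const auto & f : out) {
            if (f.is_token) { next.push_back(f); continue; }
            size_t pos = 0;
            size_t match;
            while ((match = f.text.find(st.first, pos)) != std::string::npos) {
                if (match > pos) {
                    next.push_back({false, -1, f.text.substr(pos, match - pos)}); // left remainder
                }
                next.push_back({true, st.second, ""});                            // the special token itself
                pos = match + st.first.length();
            }
            if (pos < f.text.length()) {
                next.push_back({false, -1, f.text.substr(pos)});                  // right remainder
            }
        }
        out = std::move(next);
    }
    return out;
}

The raw-text fragments that survive partitioning are then fed to the normal SPM/BPE tokenizer, while token fragments are emitted directly - which is precisely the loop llama_tokenize_internal() gained in PATCH 1.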