Skip to content

Commit

Permalink
tokenization: no double BOS tokens
Browse files (browse the repository at this point in the history)
  • Loading branch information
JohannesGaessler committed May 6, 2024
1 parent 858f6b7 commit 149603e
Showing 1 changed file with 10 additions and 10 deletions.
20 changes: 10 additions & 10 deletions llama.cpp
Original file line number | Diff line number | Diff line change
@@ -12674,11 +12674,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
// tokenizer.encode('', add_special_tokens=True) returns [1]
// tokenizer.encode('', add_special_tokens=False) returns []

if (add_special && vocab.special_add_bos != 0) {
GGML_ASSERT(vocab.special_bos_id != -1);
output.push_back(vocab.special_bos_id);
}

for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
// without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -12705,18 +12700,18 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
}
}

if (add_special && vocab.special_add_bos != 0 && output[0] != vocab.special_bos_id) {
GGML_ASSERT(vocab.special_bos_id != -1);
output.insert(output.begin(), vocab.special_bos_id);
}

if (add_special && vocab.special_add_eos == 1) {
GGML_ASSERT(vocab.special_eos_id != -1);
output.push_back(vocab.special_eos_id);
}
} break;
case LLAMA_VOCAB_TYPE_BPE:
{
if (add_special && vocab.special_add_bos != 0) {
GGML_ASSERT(vocab.special_bos_id != -1);
output.push_back(vocab.special_bos_id);
}

for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -12731,6 +12726,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
}
}

if (add_special && vocab.special_add_bos != 0 && output[0] != vocab.special_bos_id) {
GGML_ASSERT(vocab.special_bos_id != -1);
output.insert(output.begin(), vocab.special_bos_id);
}

GGML_ASSERT(vocab.special_add_eos != 1);
} break;
case LLAMA_VOCAB_TYPE_WPM:
Expand Down

0 comments on commit 149603e

Please sign in to comment.