Discard all tokens when no matching found
jaime-m-p committed May 27, 2024
1 parent 117b091 commit f3f6c0a
Showing 1 changed file with 13 additions and 15 deletions.
llama.cpp: 28 changes (13 additions & 15 deletions)
@@ -12660,7 +12660,7 @@ struct llm_tokenizer_wpm {
     llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        auto * token_map = &vocab.token_to_id;
+        const auto & token_map = vocab.token_to_id;
 
         // normalize and split by whitespace
         std::vector<std::string> words = preprocess(text);
@@ -12675,36 +12675,34 @@ struct llm_tokenizer_wpm {
             }
 
             // prepend phantom space
-            std::string word1 = "\xe2\x96\x81" + word;
-            int n = word1.size();
+            const std::string word1 = "\xe2\x96\x81" + word;
+            const int n = word1.size();
 
-            // we're at the start of a new word
-            int i = 0;
-            bool match_any = false;
+            const size_t current_tokens = output.size();
 
+            // we're at the start of a new word
             // move through character position in word
-            while (i < n) {
+            for (int i = 0; i < n; ++i) {
                 // loop through possible match length
                 bool match = false;
                 for (int j = n; j > i; j--) {
-                    auto it = token_map->find(word1.substr(i, j - i));
-                    if (it != token_map->end()) {
+                    auto it = token_map.find(word1.substr(i, j - i));
+                    if (it != token_map.end()) {
                         output.push_back(it->second);
                         match = true;
-                        match_any = true;
-                        i = j;
+                        i = j - 1;
                         break;
                     }
                 }
 
-                // must be an unknown character
-                if (!match) {
-                    i++;
+                if (!match) { // discard all
+                    output.resize(current_tokens);
+                    break; // and discard next tokens
                 }
             }
 
             // we didn't find any matches for this word
-            if (!match_any) {
+            if (current_tokens == output.size()) {
                 output.push_back(vocab.special_unk_id);
             }
         }
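
What the change does: before this commit, a character that could not be matched against the vocabulary was simply skipped (i++), so a word could still be emitted as a partial sequence of sub-tokens; after it, the first failed match discards every token already emitted for that word (output.resize(current_tokens)) and the whole word falls back to vocab.special_unk_id. Below is a minimal, self-contained sketch of the new matching behavior, assuming a toy vocabulary; tokenize_word, token_map, and unk_id are hypothetical stand-ins for the real vocab.token_to_id and vocab.special_unk_id, and the phantom-space prefix and text normalization of the real tokenizer are omitted.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Greedy longest-match WordPiece-style tokenization of a single word,
// mirroring the loop changed in this commit: if any position fails to
// match, all tokens emitted for this word are discarded and a single
// unknown token is produced instead. (Hypothetical helper, not llama.cpp API.)
static void tokenize_word(const std::string & word,
                          const std::unordered_map<std::string, int> & token_map,
                          int unk_id,
                          std::vector<int> & output) {
    const int n = (int) word.size();
    const size_t current_tokens = output.size();

    for (int i = 0; i < n; ++i) {
        bool match = false;
        // try the longest possible substring starting at i first, then shrink
        for (int j = n; j > i; j--) {
            auto it = token_map.find(word.substr(i, j - i));
            if (it != token_map.end()) {
                output.push_back(it->second);
                match = true;
                i = j - 1; // continue right after the matched piece
                break;
            }
        }
        if (!match) {
            output.resize(current_tokens); // discard this word's tokens
            break;                         // and stop matching the word
        }
    }

    // nothing (or not everything) matched: emit a single unknown token
    if (current_tokens == output.size()) {
        output.push_back(unk_id);
    }
}

int main() {
    // toy vocabulary; the real code uses the model's token_to_id map
    const std::unordered_map<std::string, int> vocab = {
        {"un", 1}, {"related", 2}, {"unrelated", 3},
    };
    std::vector<int> out;
    tokenize_word("unrelated",  vocab, /*unk_id=*/0, out); // matches as one piece
    tokenize_word("unrelatedz", vocab, /*unk_id=*/0, out); // 'z' never matches
    for (int id : out) std::cout << id << ' ';
    std::cout << '\n';
}

Running the sketch prints "3 0": "unrelated" matches as a single vocabulary entry, while "unrelatedz" fails on the trailing "z", so its partial match is discarded and the word collapses to the unknown token, which is exactly the behavior this commit introduces.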