From c75eafc41a6c64ada25a66f8dbf2eaec2a7a1531 Mon Sep 17 00:00:00 2001
From: HelloRusk
Date: Wed, 16 Mar 2022 21:55:10 +0900
Subject: [PATCH 1/6] Fixing the vocab size of the trained Unigram model

---
 tokenizers/src/models/unigram/trainer.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tokenizers/src/models/unigram/trainer.rs b/tokenizers/src/models/unigram/trainer.rs
index c76448b63..f2d92c754 100644
--- a/tokenizers/src/models/unigram/trainer.rs
+++ b/tokenizers/src/models/unigram/trainer.rs
@@ -132,7 +132,7 @@ impl UnigramTrainer {
             }
             inserted.insert(token.to_string());
             pieces.push((token.to_string(), if score.is_nan() { 0.0 } else { *score }));
-            if pieces.len() == self.vocab_size as usize {
+            if pieces.len() == self.vocab_size as usize - self.special_tokens.len() {
                 break;
             }
         }

From fb8955c831b357d1037548ceaa8789734d544646 Mon Sep 17 00:00:00 2001
From: HelloRusk
Date: Thu, 17 Mar 2022 18:31:03 +0900
Subject: [PATCH 2/6] add test for the vocab size of the trained Unigram model

---
 bindings/python/tests/bindings/test_trainers.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/bindings/python/tests/bindings/test_trainers.py b/bindings/python/tests/bindings/test_trainers.py
index bf290de9f..36fa67bd3 100644
--- a/bindings/python/tests/bindings/test_trainers.py
+++ b/bindings/python/tests/bindings/test_trainers.py
@@ -156,7 +156,14 @@ def test_can_pickle(self):
 class TestUnigram:
     def test_train(self, train_files):
         tokenizer = SentencePieceUnigramTokenizer()
-        tokenizer.train(train_files["small"], show_progress=False)
+        tokenizer.train(
+            train_files["small"],
+            show_progress=False,
+            special_tokens=["[PAD]", "[SEP]", "[CLS]"],
+            vocab_size=100
+        )
+
+        assert tokenizer.get_vocab_size() == 100
 
         filename = "tests/data/unigram_trained.json"
         tokenizer.save(filename)

From 1015b6c9ce3e431dfb03f705666f8421120316b6 Mon Sep 17 00:00:00 2001
From: HelloRusk
Date: Thu, 17 Mar 2022 18:49:23 +0900
Subject: [PATCH 3/6] Revert "add test for the vocab size of the trained Unigram model"

This reverts commit fb8955c831b357d1037548ceaa8789734d544646.
---
 bindings/python/tests/bindings/test_trainers.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/bindings/python/tests/bindings/test_trainers.py b/bindings/python/tests/bindings/test_trainers.py
index 36fa67bd3..bf290de9f 100644
--- a/bindings/python/tests/bindings/test_trainers.py
+++ b/bindings/python/tests/bindings/test_trainers.py
@@ -156,14 +156,7 @@ def test_can_pickle(self):
 class TestUnigram:
     def test_train(self, train_files):
         tokenizer = SentencePieceUnigramTokenizer()
-        tokenizer.train(
-            train_files["small"],
-            show_progress=False,
-            special_tokens=["[PAD]", "[SEP]", "[CLS]"],
-            vocab_size=100
-        )
-
-        assert tokenizer.get_vocab_size() == 100
+        tokenizer.train(train_files["small"], show_progress=False)
 
         filename = "tests/data/unigram_trained.json"
         tokenizer.save(filename)

From b28377f15390c79ec91d3fb7cdcf2cdc868d8e67 Mon Sep 17 00:00:00 2001
From: HelloRusk
Date: Thu, 17 Mar 2022 19:02:41 +0900
Subject: [PATCH 4/6] Fixing the vocab size of the trained Unigram model

---
 .../python/tests/bindings/test_trainers.py   | 22 ++++++++++++
 tokenizers/src/models/unigram/trainer.rs     | 30 +++++++++++--------
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/bindings/python/tests/bindings/test_trainers.py b/bindings/python/tests/bindings/test_trainers.py
index bf290de9f..512369690 100644
--- a/bindings/python/tests/bindings/test_trainers.py
+++ b/bindings/python/tests/bindings/test_trainers.py
@@ -237,6 +237,28 @@ def test_train_with_special_tokens(self):
             "t ",
             "[SEP]",
         ]
+
+        tokenizer = Tokenizer(models.Unigram())
+        trainer = trainers.UnigramTrainer(
+            show_progress=False, 
+            special_tokens=["[PAD]", "[SEP]", "[CLS]"], 
+            unk_token="[UNK]",
+            vocab_size=100
+        )
+        tokenizer.train([filename], trainer=trainer)
+
+        assert tokenizer.get_vocab_size() == 100
+
+        tokenizer = Tokenizer(models.Unigram())
+        trainer = trainers.UnigramTrainer(
+            show_progress=False, 
+            special_tokens=["[PAD]", "[SEP]", "[CLS]", "[UNK]"], 
+            unk_token="[UNK]",
+            vocab_size=100
+        )
+        tokenizer.train([filename], trainer=trainer)
+
+        assert tokenizer.get_vocab_size() == 100
 
     def test_cannot_train_different_model(self):
         tokenizer = Tokenizer(models.BPE())

diff --git a/tokenizers/src/models/unigram/trainer.rs b/tokenizers/src/models/unigram/trainer.rs
index f2d92c754..ffdc7ba12 100644
--- a/tokenizers/src/models/unigram/trainer.rs
+++ b/tokenizers/src/models/unigram/trainer.rs
@@ -126,19 +126,7 @@ impl UnigramTrainer {
                 min_score_penalty += min_score_penalty_delta;
             }
         }
-        for (token, score) in model.iter() {
-            if inserted.contains::<str>(token) {
-                continue;
-            }
-            inserted.insert(token.to_string());
-            pieces.push((token.to_string(), if score.is_nan() { 0.0 } else { *score }));
-            if pieces.len() == self.vocab_size as usize - self.special_tokens.len() {
-                break;
-            }
-        }
-        pieces.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
 
-        // Insert the necessary tokens
         let (unk_id, need_add_unk) = if let Some(ref unk) = self.unk_token {
             let unk_id = self.special_tokens.iter().enumerate().find_map(|(i, t)| {
                 if t.content == *unk {
@@ -154,6 +142,24 @@ impl UnigramTrainer {
         } else {
             (None, false)
         };
+
+        for (token, score) in model.iter() {
+            if inserted.contains::<str>(token) {
+                continue;
+            }
+            inserted.insert(token.to_string());
+            pieces.push((token.to_string(), if score.is_nan() { 0.0 } else { *score }));
+
+            let vocab_size_without_special_tokens = 
+                if need_add_unk { self.vocab_size as usize - self.special_tokens.len() - 1 } 
+                else { self.vocab_size as usize - self.special_tokens.len() };
+            if pieces.len() == vocab_size_without_special_tokens {
+                break;
+            }
+        }
+        pieces.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap());
+
+        // Insert the necessary tokens
         let mut special_tokens = self
             .special_tokens
             .iter()

From 70042396327bb727f480017fc0944ff0e1f0c27a Mon Sep 17 00:00:00 2001
From: HelloRusk
Date: Fri, 18 Mar 2022 15:26:53 +0900
Subject: [PATCH 5/6] format codes

---
 .../python/tests/bindings/test_trainers.py   | 20 +++++++++----------
 tokenizers/src/models/unigram/trainer.rs     |  8 +++++---
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/bindings/python/tests/bindings/test_trainers.py b/bindings/python/tests/bindings/test_trainers.py
index 512369690..ab03caa73 100644
--- a/bindings/python/tests/bindings/test_trainers.py
+++ b/bindings/python/tests/bindings/test_trainers.py
@@ -237,27 +237,27 @@ def test_train_with_special_tokens(self):
             "t ",
             "[SEP]",
         ]
-        
+
         tokenizer = Tokenizer(models.Unigram())
         trainer = trainers.UnigramTrainer(
-            show_progress=False, 
-            special_tokens=["[PAD]", "[SEP]", "[CLS]"], 
+            show_progress=False,
+            special_tokens=["[PAD]", "[SEP]", "[CLS]"],
             unk_token="[UNK]",
-            vocab_size=100
+            vocab_size=100,
         )
         tokenizer.train([filename], trainer=trainer)
-        
+
         assert tokenizer.get_vocab_size() == 100
-        
+
         tokenizer = Tokenizer(models.Unigram())
         trainer = trainers.UnigramTrainer(
-            show_progress=False, 
-            special_tokens=["[PAD]", "[SEP]", "[CLS]", "[UNK]"], 
+            show_progress=False,
+            special_tokens=["[PAD]", "[SEP]", "[CLS]", "[UNK]"],
             unk_token="[UNK]",
-            vocab_size=100
+            vocab_size=100,
         )
         tokenizer.train([filename], trainer=trainer)
-        
+
         assert tokenizer.get_vocab_size() == 100
 
     def test_cannot_train_different_model(self):

diff --git a/tokenizers/src/models/unigram/trainer.rs b/tokenizers/src/models/unigram/trainer.rs
index ffdc7ba12..2af52fa39 100644
--- a/tokenizers/src/models/unigram/trainer.rs
+++ b/tokenizers/src/models/unigram/trainer.rs
@@ -150,9 +150,11 @@ impl UnigramTrainer {
             inserted.insert(token.to_string());
             pieces.push((token.to_string(), if score.is_nan() { 0.0 } else { *score }));
 
-            let vocab_size_without_special_tokens = 
-                if need_add_unk { self.vocab_size as usize - self.special_tokens.len() - 1 } 
-                else { self.vocab_size as usize - self.special_tokens.len() };
+            let vocab_size_without_special_tokens = if need_add_unk {
+                self.vocab_size as usize - self.special_tokens.len() - 1
+            } else {
+                self.vocab_size as usize - self.special_tokens.len()
+            };
             if pieces.len() == vocab_size_without_special_tokens {
                 break;
             }

From f06220a572933c79e4c68f8eda3d6d22f35cbd31 Mon Sep 17 00:00:00 2001
From: HelloRusk
Date: Fri, 18 Mar 2022 21:32:42 +0900
Subject: [PATCH 6/6] get the position of vocab-size calculation out of loop

---
 tokenizers/src/models/unigram/trainer.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tokenizers/src/models/unigram/trainer.rs b/tokenizers/src/models/unigram/trainer.rs
index 2af52fa39..dc5e536f6 100644
--- a/tokenizers/src/models/unigram/trainer.rs
+++ b/tokenizers/src/models/unigram/trainer.rs
@@ -143,6 +143,11 @@ impl UnigramTrainer {
             (None, false)
         };
 
+        let vocab_size_without_special_tokens = if need_add_unk {
+            self.vocab_size as usize - self.special_tokens.len() - 1
+        } else {
+            self.vocab_size as usize - self.special_tokens.len()
+        };
         for (token, score) in model.iter() {
             if inserted.contains::<str>(token) {
                 continue;
@@ -150,11 +155,6 @@ impl UnigramTrainer {
             inserted.insert(token.to_string());
             pieces.push((token.to_string(), if score.is_nan() { 0.0 } else { *score }));
 
-            let vocab_size_without_special_tokens = if need_add_unk {
-                self.vocab_size as usize - self.special_tokens.len() - 1
-            } else {
-                self.vocab_size as usize - self.special_tokens.len()
-            };
             if pieces.len() == vocab_size_without_special_tokens {
                 break;
            }
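
Taken together, the series changes what `vocab_size` means for `UnigramTrainer`: it is now the size of the final vocabulary, i.e. learned pieces plus the special tokens, plus the unk token when it is not already listed among them, rather than the count of learned pieces alone (PATCH 6/6 is behavior-preserving and only hoists the budget computation out of the token loop). Below is a minimal end-to-end sketch of the fixed behavior against the Python bindings, mirroring the assertions added in PATCH 4/6; the corpus path `corpus.txt` is a placeholder, not taken from the patches, and any small plain-text file with enough distinct words to reach the requested size will do.

    from tokenizers import Tokenizer, models, trainers

    # Placeholder corpus path (illustrative, not from the patches).
    files = ["corpus.txt"]

    tokenizer = Tokenizer(models.Unigram())
    trainer = trainers.UnigramTrainer(
        show_progress=False,
        special_tokens=["[PAD]", "[SEP]", "[CLS]"],
        unk_token="[UNK]",  # not listed above, so the need_add_unk branch applies
        vocab_size=100,
    )
    tokenizer.train(files, trainer=trainer)

    # 96 learned pieces + [PAD]/[SEP]/[CLS] + the implicitly added [UNK] == 100.
    # (If "[UNK]" were included in special_tokens, the loop would instead stop at
    # vocab_size - len(special_tokens) learned pieces, with the same final total.)
    assert tokenizer.get_vocab_size() == 100

Before PATCH 1/6, the loop collected `vocab_size` learned pieces and the special tokens were prepended on top, so the trained model ended up larger than requested; with the series applied, `get_vocab_size()` matches the requested budget exactly in both the `need_add_unk` and the explicit-`[UNK]` cases.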