From 60b5149d17a04e04946564820234df5c94a7776f Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 28 Dec 2021 17:02:50 +0100
Subject: [PATCH] [Tests] Speed up tokenizer tests (#14964)

* speed up canine and mluke

* speed up mbart and mbart50 toks

* upload files
---
 tests/test_tokenization_canine.py      |  7 ++-
 tests/test_tokenization_layoutxlm.py   | 72 ++++++++++++++++++++++++
 tests/test_tokenization_mbart.py       | 73 ++++++++++++++++++++++++
 tests/test_tokenization_mbart50.py     | 77 +++++++++++++++++++++++++-
 tests/test_tokenization_mluke.py       |  3 +-
 tests/test_tokenization_xlm_roberta.py | 72 ++++++++++++++++++++++++
 6 files changed, 297 insertions(+), 7 deletions(-)

diff --git a/tests/test_tokenization_canine.py b/tests/test_tokenization_canine.py
index e6b786ccc902..d52cf82be760 100644
--- a/tests/test_tokenization_canine.py
+++ b/tests/test_tokenization_canine.py
@@ -39,11 +39,12 @@ def setUp(self):
 
     @cached_property
     def canine_tokenizer(self):
-        # TODO replace nielsr by google
-        return CanineTokenizer.from_pretrained("nielsr/canine-s")
+        return CanineTokenizer.from_pretrained("google/canine-s")
 
     def get_tokenizer(self, **kwargs) -> CanineTokenizer:
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+        tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+        tokenizer._unicode_vocab_size = 1024
+        return tokenizer
 
     @require_torch
     def test_prepare_batch_integration(self):
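
The CANINE change above is the whole speed-up for that file: get_tokenizer caps the
private _unicode_vocab_size attribute at 1024, so shared tests that loop over the whole
vocabulary no longer walk the full Unicode range (on the order of a million code points).
A self-contained toy sketch of the effect; ToyCharTokenizer is hypothetical, not the real
CanineTokenizer:

# Toy illustration: CANINE ids are Unicode code points, so an uncapped
# vocab-wide loop touches every code point there is.

UNICODE_VOCAB_SIZE = 1114112  # 0x110000 possible code points

class ToyCharTokenizer:
    def __init__(self, vocab_size=UNICODE_VOCAB_SIZE):
        # same private attribute the patched get_tokenizer caps
        self._unicode_vocab_size = vocab_size

    @property
    def vocab_size(self):
        return self._unicode_vocab_size

    def convert_ids_to_tokens(self, ids):
        return [chr(i) for i in ids]

tok = ToyCharTokenizer()
tok._unicode_vocab_size = 1024  # what the patched test does
# a vocab-wide check now visits 1024 ids instead of ~1.1 million
assert len(tok.convert_ids_to_tokens(range(tok.vocab_size))) == 1024

Because _unicode_vocab_size is private, this is test-only surgery; nothing changes for
users of the google/canine-s checkpoint.
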
diff --git a/tests/test_tokenization_layoutxlm.py b/tests/test_tokenization_layoutxlm.py
index 025a0ef29ad1..f2bf284ffeae 100644
--- a/tests/test_tokenization_layoutxlm.py
+++ b/tests/test_tokenization_layoutxlm.py
@@ -1642,6 +1642,78 @@ def test_batch_encode_dynamic_overflowing(self):
                     self.assertEqual(len(tokens[key].shape), 3)
                     self.assertEqual(tokens[key].shape[-1], 4)
 
+    # overwrite from test_tokenization_common to speed up test
+    def test_save_pretrained(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            return
+
+        self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-layoutxlm", {})
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saves with the same files + the tokenizer.json file for the fast one
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+                    # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
+                    # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=True
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saves with the same files
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=False
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saved the tokenizer.json file
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
     @unittest.skip("TO DO: overwrite this very extensive test.")
     def test_alignement_methods(self):
         pass
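
Each test_save_pretrained override in this patch follows the same recipe: repoint
self.tokenizers_list[0] at a tiny hf-internal-testing checkpoint, then run the same
save/reload assertions as the common test. A condensed sketch of the round trip the
first block performs, assuming the tiny repo also resolves through AutoTokenizer (the
test itself instantiates the model-specific classes) and that network access works:

import tempfile

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-layoutxlm")
with tempfile.TemporaryDirectory() as tmpdir:
    files = tok.save_pretrained(tmpdir)
    # fast tokenizers also serialize a consolidated tokenizer.json
    assert any("tokenizer.json" in f for f in files)
    reloaded = AutoTokenizer.from_pretrained(tmpdir)
    assert reloaded.special_tokens_map == tok.special_tokens_map

The tiny checkpoint keeps both the download and the SentencePiece model small, which is
what makes the overridden test fast.
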
diff --git a/tests/test_tokenization_mbart.py b/tests/test_tokenization_mbart.py
index 02a87d285cea..c2c50c95322a 100644
--- a/tests/test_tokenization_mbart.py
+++ b/tests/test_tokenization_mbart.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import shutil
 import tempfile
 import unittest
 
@@ -122,6 +123,78 @@ def test_full_tokenizer(self):
             ],
         )
 
+    # overwrite from test_tokenization_common to speed up test
+    def test_save_pretrained(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            return
+
+        self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart", {})
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saves with the same files + the tokenizer.json file for the fast one
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+                    # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
+                    # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=True
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saves with the same files
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=False
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saved the tokenizer.json file
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
 
 @require_torch
 @require_sentencepiece
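
The three save blocks in each override exercise the three legacy_format modes of a fast
tokenizer's save_pretrained: True writes only the legacy slow-tokenizer files, False
writes only the consolidated tokenizer.json, and the default (None) writes both, which
is why the first block filters tokenizer.json out before comparing file lists. A minimal
sketch of that contract, assuming the tiny mBART repo above loads as a fast tokenizer:

import os
import tempfile

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-mbart")
for legacy in (True, False):
    with tempfile.TemporaryDirectory() as tmpdir:
        tok.save_pretrained(tmpdir, legacy_format=legacy)
        # tokenizer.json is written only in the non-legacy mode
        assert ("tokenizer.json" in os.listdir(tmpdir)) is (not legacy)
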
diff --git a/tests/test_tokenization_mbart50.py b/tests/test_tokenization_mbart50.py
index 1327822f0352..4f076e2f0911 100644
--- a/tests/test_tokenization_mbart50.py
+++ b/tests/test_tokenization_mbart50.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import shutil
 import tempfile
 import unittest
 
@@ -34,7 +35,7 @@
 
 @require_sentencepiece
 @require_tokenizers
-class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = MBart50Tokenizer
     rust_tokenizer_class = MBart50TokenizerFast
     test_rust_tokenizer = True
@@ -113,11 +114,83 @@ def test_tokenizer_integration(self):
             revision="d3913889c59cd5c9e456b269c376325eabad57e2",
         )
 
+    # overwrite from test_tokenization_common to speed up test
+    def test_save_pretrained(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            return
+
+        self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {})
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saves with the same files + the tokenizer.json file for the fast one
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+                    # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
+                    # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=True
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saves with the same files
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=False
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saved the tokenizer.json file
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
 
 @require_torch
 @require_sentencepiece
 @require_tokenizers
-class MBartOneToManyIntegrationTest(unittest.TestCase):
+class MBart50OneToManyIntegrationTest(unittest.TestCase):
     checkpoint_name = "facebook/mbart-large-50-one-to-many-mmt"
     src_text = [
         " UN Chief Says There Is No Military Solution in Syria",
diff --git a/tests/test_tokenization_mluke.py b/tests/test_tokenization_mluke.py
index fce11a07b734..9652e44aa910 100644
--- a/tests/test_tokenization_mluke.py
+++ b/tests/test_tokenization_mluke.py
@@ -70,9 +70,8 @@ def mluke_dict_integration_testing(self):
             [35378, 8999, 38, 33273, 11676, 604, 365, 21392, 201, 1819],
         )
 
-    @slow
     def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("studio-ousia/mluke-base")
+        tokenizer = self.tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-mluke")
 
         text = tokenizer.encode("sequence builders", add_special_tokens=False)
         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
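
Dropping @slow from the mLUKE test_sequence_builders is safe because the property under
test, where build_inputs_with_special_tokens places the special tokens, does not depend
on vocabulary size or weights, so the tiny-random checkpoint exercises it as well as
studio-ousia/mluke-base did. A toy illustration, assuming mLUKE keeps the XLM-RoBERTa-style
<s> A </s></s> B </s> pair layout; the class below is hypothetical:

# Hypothetical stand-in showing the checkpoint-independent property:
# only the id layout around special tokens is asserted, never weights.

class ToyXlmRLikeTokenizer:
    bos_token_id = 0  # <s>
    eos_token_id = 2  # </s>

    def build_inputs_with_special_tokens(self, ids_0, ids_1=None):
        if ids_1 is None:
            return [self.bos_token_id] + ids_0 + [self.eos_token_id]
        return (
            [self.bos_token_id] + ids_0 + [self.eos_token_id] * 2
            + ids_1 + [self.eos_token_id]
        )

t = ToyXlmRLikeTokenizer()
assert t.build_inputs_with_special_tokens([7, 8]) == [0, 7, 8, 2]
assert t.build_inputs_with_special_tokens([7], [9]) == [0, 7, 2, 2, 9, 2]
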
diff --git a/tests/test_tokenization_xlm_roberta.py b/tests/test_tokenization_xlm_roberta.py
index d25782ae196c..f7bff7848366 100644
--- a/tests/test_tokenization_xlm_roberta.py
+++ b/tests/test_tokenization_xlm_roberta.py
@@ -140,6 +140,78 @@ def test_full_tokenizer(self):
             ],
         )
 
+    # overwrite from test_tokenization_common to speed up test
+    def test_save_pretrained(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            return
+
+        self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-xlm-roberta", {})
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saves with the same files + the tokenizer.json file for the fast one
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+                    # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
+                    # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=True
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saves with the same files
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=False
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saved the tokenizer.json file
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
     @cached_property
     def big_tokenizer(self):
         return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
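
One trade-off worth noting: the same roughly 70-line override is now duplicated across
four test files. A hedged refactoring sketch, not part of this patch and with hypothetical
names, of how the checkpoint swap could live in one place instead:

# Hypothetical helper: hoist the tiny-checkpoint swap into one mixin
# that the four test classes could share, assuming tokenizers_list is
# populated by the base setUp as in test_tokenization_common.

class TinyCheckpointMixin:
    tiny_checkpoint = None  # e.g. "hf-internal-testing/tiny-random-mbart"

    def setUp(self):
        super().setUp()
        if self.tiny_checkpoint is not None:
            self.tokenizers_list[0] = (self.rust_tokenizer_class, self.tiny_checkpoint, {})

Keeping the copies, as the patch does, has its own virtue: each test file stays readable
on its own.
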