[Tests] Speed up tokenizer tests (huggingface#14964)
* speed up canine and mluke

* speed up mbart and mbart50 toks

* upload files
patrickvonplaten authored and Steven committed Jan 6, 2022
1 parent c3672f1 commit 60b5149
Showing 6 changed files with 297 additions and 7 deletions.
7 changes: 4 additions & 3 deletions tests/test_tokenization_canine.py
@@ -39,11 +39,12 @@ def setUp(self):

@cached_property
def canine_tokenizer(self):
- # TODO replace nielsr by google
- return CanineTokenizer.from_pretrained("nielsr/canine-s")
+ return CanineTokenizer.from_pretrained("google/canine-s")

def get_tokenizer(self, **kwargs) -> CanineTokenizer:
- return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+ tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+ tokenizer._unicode_vocab_size = 1024
+ return tokenizer

@require_torch
def test_prepare_batch_integration(self):
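For context: the new get_tokenizer() caps _unicode_vocab_size at 1024, and that cap is what makes the shared tests cheap, because CANINE's tokenizer otherwise exposes one id per Unicode code point. A minimal sketch (not part of the commit) of the effect, assuming the vocab-sized loops in the common tests are driven by this attribute:

# Sketch only: why shrinking _unicode_vocab_size shortens vocab-wide test loops.
# Assumes the common tests size their loops from this attribute, which is what
# the override in get_tokenizer() relies on.
from transformers import CanineTokenizer

tokenizer = CanineTokenizer()
print(tokenizer._unicode_vocab_size)  # 1114112: one id per Unicode code point

# A test that walks the whole vocab would otherwise do ~1.1M iterations:
# vocab = {chr(i): i for i in range(tokenizer._unicode_vocab_size)}

# After the override, the same walk touches only 1024 code points:
tokenizer._unicode_vocab_size = 1024
vocab = {chr(i): i for i in range(tokenizer._unicode_vocab_size)}
assert len(vocab) == 1024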
72 changes: 72 additions & 0 deletions tests/test_tokenization_layoutxlm.py
@@ -1642,6 +1642,78 @@ def test_batch_encode_dynamic_overflowing(self):
self.assertEqual(len(tokens[key].shape), 3)
self.assertEqual(tokens[key].shape[-1], 4)

# overwrite from test_tokenization_common to speed up test
def test_save_pretrained(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return

self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-layoutxlm", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saves with the same files + the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))

shutil.rmtree(tmpdirname2)

# Save tokenizer rust, legacy_format=True
tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saves with the same files
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))

shutil.rmtree(tmpdirname2)

# Save tokenizer rust, legacy_format=False
tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saved the tokenizer.json file
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))

shutil.rmtree(tmpdirname2)

@unittest.skip("TO DO: overwrite this very extensive test.")
def test_alignement_methods(self):
pass
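The override above keeps the structure of the shared test_save_pretrained but pins tokenizers_list[0] to the tiny hf-internal-testing/tiny-random-layoutxlm checkpoint, so the download and the three save/load round-trips stay small. A condensed sketch (not the repository's code) of the first round-trip, assuming the tiny checkpoint carries the files both tokenizer classes expect:

# Condensed sketch of the first save/load round-trip in the overridden test.
import shutil
import tempfile

from transformers import LayoutXLMTokenizer, LayoutXLMTokenizerFast

checkpoint = "hf-internal-testing/tiny-random-layoutxlm"
tokenizer_r = LayoutXLMTokenizerFast.from_pretrained(checkpoint)
tokenizer_p = LayoutXLMTokenizer.from_pretrained(checkpoint)

tmpdir = tempfile.mkdtemp()
try:
    fast_files = tokenizer_r.save_pretrained(tmpdir)
    slow_files = tokenizer_p.save_pretrained(tmpdir)

    # The fast tokenizer writes tokenizer.json on top of the slow tokenizer's files.
    assert any("tokenizer.json" in f for f in fast_files)
    assert tuple(f for f in fast_files if "tokenizer.json" not in f) == tuple(slow_files)

    # Both serializations reload, and the special-token attributes line up.
    reloaded_fast = LayoutXLMTokenizerFast.from_pretrained(tmpdir)
    reloaded_slow = LayoutXLMTokenizer.from_pretrained(tmpdir)
    for key in reloaded_slow.special_tokens_map:
        assert hasattr(reloaded_fast, key)
finally:
    shutil.rmtree(tmpdir)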
73 changes: 73 additions & 0 deletions tests/test_tokenization_mbart.py
@@ -13,6 +13,7 @@
# limitations under the License.

import os
import shutil
import tempfile
import unittest

@@ -122,6 +123,78 @@ def test_full_tokenizer(self):
],
)

# overwrite from test_tokenization_common to speed up test
def test_save_pretrained(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return

self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saves with the same files + the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))

shutil.rmtree(tmpdirname2)

# Save tokenizer rust, legacy_format=True
tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saves with the same files
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))

shutil.rmtree(tmpdirname2)

# Save tokenizer rust, legacy_format=False
tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saved the tokenizer.json file
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))

shutil.rmtree(tmpdirname2)


@require_torch
@require_sentencepiece
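The second and third blocks of the override exercise save_pretrained's legacy_format flag: with legacy_format=True the fast tokenizer is expected to write exactly the files the Python tokenizer writes, and with legacy_format=False it is expected to write the unified tokenizer.json. A short sketch of that contrast; expected outcomes mirror the test's assertions, and the tiny checkpoint is the one named in the diff:

# Sketch of the legacy_format contrast checked above.
import tempfile

from transformers import MBartTokenizerFast

tok = MBartTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-mbart")

with tempfile.TemporaryDirectory() as tmpdir:
    # Slow-compatible serialization: same files as the Python class, no tokenizer.json.
    legacy_files = tok.save_pretrained(tmpdir, legacy_format=True)
    print(any("tokenizer.json" in f for f in legacy_files))  # expected: False

with tempfile.TemporaryDirectory() as tmpdir:
    # Fast-only serialization: a single tokenizer.json carries the whole tokenizer.
    fast_only_files = tok.save_pretrained(tmpdir, legacy_format=False)
    print(any("tokenizer.json" in f for f in fast_only_files))  # expected: True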
77 changes: 75 additions & 2 deletions tests/test_tokenization_mbart50.py
@@ -13,6 +13,7 @@
# limitations under the License.

import os
import shutil
import tempfile
import unittest

@@ -34,7 +35,7 @@

@require_sentencepiece
@require_tokenizers
- class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+ class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MBart50Tokenizer
rust_tokenizer_class = MBart50TokenizerFast
test_rust_tokenizer = True
@@ -113,11 +114,83 @@ def test_tokenizer_integration(self):
revision="d3913889c59cd5c9e456b269c376325eabad57e2",
)

# overwrite from test_tokenization_common to speed up test
def test_save_pretrained(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return

self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saves with the same files + the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))

shutil.rmtree(tmpdirname2)

# Save tokenizer rust, legacy_format=True
tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saves with the same files
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))

shutil.rmtree(tmpdirname2)

# Save tokenizer rust, legacy_format=False
tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saved the tokenizer.json file
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))

shutil.rmtree(tmpdirname2)


@require_torch
@require_sentencepiece
@require_tokenizers
- class MBartOneToManyIntegrationTest(unittest.TestCase):
+ class MBart50OneToManyIntegrationTest(unittest.TestCase):
checkpoint_name = "facebook/mbart-large-50-one-to-many-mmt"
src_text = [
" UN Chief Says There Is No Military Solution in Syria",
3 changes: 1 addition & 2 deletions tests/test_tokenization_mluke.py
@@ -70,9 +70,8 @@ def mluke_dict_integration_testing(self):
[35378, 8999, 38, 33273, 11676, 604, 365, 21392, 201, 1819],
)

- @slow
def test_sequence_builders(self):
- tokenizer = self.tokenizer_class.from_pretrained("studio-ousia/mluke-base")
+ tokenizer = self.tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-mluke")

text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
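Dropping @slow works here because the test no longer needs the full studio-ousia/mluke-base download: test_sequence_builders only checks how special tokens are composed around one and two sequences, which a tiny checkpoint covers. A sketch of what the test exercises; the token ids depend on the tiny checkpoint's vocabulary, so only the layout is shown:

# Sketch of the sequence-builder behaviour the test checks.
from transformers import MLukeTokenizer

tokenizer = MLukeTokenizer.from_pretrained("hf-internal-testing/tiny-random-mluke")

text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

# For this XLM-R-style tokenizer the expected layouts are:
#   single sequence: <s> text </s>
#   sequence pair:   <s> text </s></s> text_2 </s>
print(encoded_sentence)
print(encoded_pair)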
72 changes: 72 additions & 0 deletions tests/test_tokenization_xlm_roberta.py
@@ -140,6 +140,78 @@ def test_full_tokenizer(self):
],
)

# overwrite from test_tokenization_common to speed up test
def test_save_pretrained(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return

self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-xlm-roberta", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saves with the same files + the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))

shutil.rmtree(tmpdirname2)

# Save tokenizer rust, legacy_format=True
tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saves with the same files
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))

shutil.rmtree(tmpdirname2)

# Save tokenizer rust, legacy_format=False
tmpdirname2 = tempfile.mkdtemp()

tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

# Checks it saved the tokenizer.json file
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))

# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))

shutil.rmtree(tmpdirname2)

@cached_property
def big_tokenizer(self):
return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
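The special-token check at the end of each round-trip above is deliberately loose: every key of the Python tokenizer's special_tokens_map must exist as an attribute on the reloaded fast tokenizer, while the stricter value and id comparisons stay commented out. A small sketch of what that check looks at, using the tiny checkpoint named above; the listed keys are the usual XLM-R ones and are illustrative:

# Sketch of the special-token attributes the round-trip check relies on.
from transformers import XLMRobertaTokenizerFast

tok = XLMRobertaTokenizerFast.from_pretrained("hf-internal-testing/tiny-xlm-roberta")

print(sorted(tok.special_tokens_map))
# typically: ['bos_token', 'cls_token', 'eos_token', 'mask_token', 'pad_token', 'sep_token', 'unk_token']

for key in tok.special_tokens_map:
    assert hasattr(tok, key)          # e.g. tok.bos_token -> "<s>"
    assert hasattr(tok, key + "_id")  # the matching id, e.g. tok.bos_token_id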
