Fixing tests for Perceiver (#14745)
- Do not run the image-classification pipeline test (`_CHECKPOINT_FOR_DOC` points to the
  language checkpoint, which cannot load a `FeatureExtractor`, so the current logic fails).
- Add a safeguard to skip tests when `tokenizer_class` or
  `feature_extractor_class` **are** defined but cannot be loaded.
  This happens for Perceiver with the "FastTokenizer" (which doesn't exist,
  so it is `None`) and with the `FeatureExtractor` (which does exist but cannot
  be loaded because the checkpoint doesn't define one, which is reasonable for
  that checkpoint).
- Added a `get_vocab` method to `PerceiverTokenizer`, since the `fill-mask`
  pipeline uses it when the `targets` argument is passed to narrow the set of
  candidate tokens (see the sketch below).
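
As context for the `get_vocab` point, here is a minimal sketch (not part of the
commit) of how `fill-mask` uses the vocabulary when `targets` is passed. The bare
`PerceiverTokenizer()` constructor is an assumption (it is a byte-level tokenizer
with no vocab file), and `lookup_targets` is a hypothetical helper, not the
pipeline's actual code.

```python
from transformers import PerceiverTokenizer

# Assumption: PerceiverTokenizer builds without a checkpoint (byte-level, no vocab file).
tokenizer = PerceiverTokenizer()
vocab = tokenizer.get_vocab()  # {token: id}, the method added by this commit


def lookup_targets(targets, vocab):
    # Hypothetical helper mirroring what `fill-mask` does with `targets`:
    # keep only candidates that exist in the tokenizer's vocabulary.
    return [vocab[t] for t in targets if t in vocab]


# Byte tokens sit after the 6 special tokens, so "a" maps to ord("a") + 6 = 103.
print(lookup_targets(["a", "b", "not-a-single-byte"], vocab))  # [103, 104]
```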

Co-authored-by: Nicolas Patry <[email protected]>
LysandreJik and Narsil authored Dec 13, 2021
1 parent 4c99e55 commit 3d66146
Showing 4 changed files with 27 additions and 3 deletions.
src/transformers/models/auto/feature_extraction_auto.py (1 addition, 0 deletions)

@@ -43,6 +43,7 @@
         ("detr", "DetrFeatureExtractor"),
         ("layoutlmv2", "LayoutLMv2FeatureExtractor"),
         ("clip", "CLIPFeatureExtractor"),
+        ("perceiver", "PerceiverFeatureExtractor"),
     ]
 )
src/transformers/models/perceiver/tokenization_perceiver.py (10 additions, 2 deletions)

@@ -87,7 +87,7 @@ def __init__(
         self._utf_vocab_size = 2 ** 8  # utf is 8 bits

         # define special tokens dict
-        self.special_tokens_encoder: Dict[int, str] = {
+        self.special_tokens_encoder: Dict[str, int] = {
             self.pad_token: 0,
             self.bos_token: 1,
             self.eos_token: 2,
@@ -96,7 +96,15 @@
             self.sep_token: 5,
         }
         self._num_special_tokens = len(self.special_tokens_encoder)
-        self.special_tokens_decoder: Dict[str, int] = {v: k for k, v in self.special_tokens_encoder.items()}
+        self.special_tokens_decoder: Dict[int, str] = {v: k for k, v in self.special_tokens_encoder.items()}
+
+    def get_vocab(self) -> Dict[str, int]:
+        vocab = self.special_tokens_encoder.copy()
+        vocab.update(self.added_tokens_encoder)
+        for i in range(self._utf_vocab_size):
+            token = chr(i)
+            vocab[token] = i + len(self.special_tokens_encoder)
+        return vocab

     @property
     def vocab_size(self):
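A quick sanity check of the change above (a sketch, with the bare constructor as
an assumption): the corrected annotations mean `special_tokens_encoder` maps
`str -> int` and `special_tokens_decoder` maps `int -> str`, and the new
`get_vocab` lays out the 6 special tokens followed by the 256 byte tokens.

```python
from transformers import PerceiverTokenizer

tok = PerceiverTokenizer()  # assumption: works without a checkpoint

# Encoder and decoder are inverses: str -> int and int -> str.
pad_id = tok.special_tokens_encoder[tok.pad_token]
assert tok.special_tokens_decoder[pad_id] == tok.pad_token

vocab = tok.get_vocab()
# 6 special tokens + 2**8 byte tokens, assuming no added tokens.
assert len(vocab) == tok._num_special_tokens + 2 ** 8
assert vocab[chr(0)] == tok._num_special_tokens  # first byte token comes right after the specials
```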
tests/test_pipelines_common.py (6 additions, 0 deletions)

@@ -169,6 +169,11 @@ def test(self):
             else:
                 tokenizer = None
             feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config)
+
+            if tokenizer is None and feature_extractor is None:
+                self.skipTest(
+                    f"Ignoring {ModelClass}, cannot create a tokenizer or feature_extractor (PerceiverConfig with no FastTokenizer ?)"
+                )
             pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor)
             if pipeline is None:
                 # The test can disable itself, but it should be very marginal
@@ -213,6 +218,7 @@ def data(n):
            if not tokenizer_classes:
                # We need to test even if there are no tokenizers.
                tokenizer_classes = [None]
+
            for tokenizer_class in tokenizer_classes:
                if tokenizer_class is not None:
                    tokenizer_name = tokenizer_class.__name__
tests/test_pipelines_image_classification.py (10 additions, 1 deletion)

@@ -14,7 +14,12 @@

 import unittest

-from transformers import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, PreTrainedTokenizer, is_vision_available
+from transformers import (
+    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+    PerceiverConfig,
+    PreTrainedTokenizer,
+    is_vision_available,
+)
 from transformers.pipelines import ImageClassificationPipeline, pipeline
 from transformers.testing_utils import (
     is_pipeline_test,
@@ -45,6 +50,10 @@ class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
     model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING

     def get_test_pipeline(self, model, tokenizer, feature_extractor):
+        if isinstance(model.config, PerceiverConfig):
+            self.skipTest(
+                "Perceiver model tester is defined with a language one, which has no feature_extractor, so the automated test cannot work here"
+            )

         image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor)
         examples = [
