From 4fb92d3eefab975d81a9ac28518815f0b76b1ae0 Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Wed, 30 Mar 2022 17:13:00 +0200
Subject: [PATCH] add code samples for TF speech models

---
 src/transformers/utils/doc.py | 63 +++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py
index f395f8d4fb80..eaf59ba50215 100644
--- a/src/transformers/utils/doc.py
+++ b/src/transformers/utils/doc.py
@@ -794,6 +794,67 @@ def _prepare_output_docstrings(output_type, config_class, min_indent=None):
     ```
 """
 
+TF_SPEECH_BASE_MODEL_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> from datasets import load_dataset
+
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = dataset.sort("id")
+    >>> sampling_rate = dataset.features["audio"].sampling_rate
+
+    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> # audio file is decoded on the fly
+    >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="tf")
+    >>> outputs = model(**inputs)
+
+    >>> last_hidden_states = outputs.last_hidden_state
+    >>> list(last_hidden_states.shape)
+    {expected_output}
+    ```
+"""
+
+TF_SPEECH_CTC_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> from datasets import load_dataset
+    >>> import tensorflow as tf
+
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = dataset.sort("id")
+    >>> sampling_rate = dataset.features["audio"].sampling_rate
+
+    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> # audio file is decoded on the fly
+    >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="tf")
+    >>> logits = model(**inputs).logits
+    >>> predicted_ids = tf.math.argmax(logits, axis=-1)
+
+    >>> # transcribe speech
+    >>> transcription = processor.batch_decode(predicted_ids)
+    >>> transcription[0]
+    {expected_output}
+    ```
+
+    ```python
+    >>> with processor.as_target_processor():
+    ...     inputs["labels"] = processor(dataset[0]["text"], return_tensors="tf").input_ids
+
+    >>> # compute loss
+    >>> loss = model(**inputs).loss
+    >>> round(float(loss), 2)
+    {expected_loss}
+    ```
+"""
+
 TF_VISION_BASE_MODEL_SAMPLE = r"""
     Example:
 
@@ -848,6 +909,8 @@ def _prepare_output_docstrings(output_type, config_class, min_indent=None):
     "MaskedLM": TF_MASKED_LM_SAMPLE,
     "LMHead": TF_CAUSAL_LM_SAMPLE,
     "BaseModel": TF_BASE_MODEL_SAMPLE,
+    "SpeechBaseModel": TF_SPEECH_BASE_MODEL_SAMPLE,
+    "CTC": TF_SPEECH_CTC_SAMPLE,
+    "VisionBaseModel": TF_VISION_BASE_MODEL_SAMPLE,
     "ImageClassification": TF_VISION_SEQ_CLASS_SAMPLE,
 }
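
Note (not part of the patch): the two templates above are consumed by `add_code_sample_docstrings` in this same `doc.py`, which routes to the new `"CTC"` entry when the decorated model class has "CTC" in its name, routes to `"SpeechBaseModel"` when `modality="audio"` is passed, and then fills the `{...}` placeholders. Below is a minimal sketch of how a TF speech model would opt in, assuming the decorator's keyword arguments at this commit; the stub class, checkpoint, and expected shape are illustrative placeholders, not values from this patch:

```python
# Sketch only: wiring a (stubbed) TF speech model to the new
# TF_SPEECH_BASE_MODEL_SAMPLE template via add_code_sample_docstrings.
from transformers.modeling_tf_outputs import TFBaseModelOutput
from transformers.utils.doc import add_code_sample_docstrings

_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h"  # placeholder checkpoint
_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]  # placeholder hidden-state shape


class TFWav2Vec2Model:  # stub; only the class name matters for sample selection
    @add_code_sample_docstrings(
        processor_class="Wav2Vec2Processor",
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutput,
        config_class="Wav2Vec2Config",
        modality="audio",  # routes to the new "SpeechBaseModel" entry
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def call(self, input_values):
        ...


# The decorator appends the filled-in sample to the method's docstring:
print(TFWav2Vec2Model.call.__doc__)
```

A CTC head would instead be selected through its class name (e.g. `TFWav2Vec2ForCTC`) and would additionally pass `expected_output`/`expected_loss` values matching the doctest lines in `TF_SPEECH_CTC_SAMPLE`.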