Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Wav2Vec2] Rename model's feature extractor to feature encoder #14959

Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 32 additions & 11 deletions examples/pytorch/audio-classification/run_audio_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import logging
import os
import sys
import warnings
from dataclasses import dataclass, field
from random import randint
from typing import Optional
Expand Down Expand Up @@ -76,24 +77,24 @@ class DataTrainingArguments:
eval_file: Optional[str] = field(
default=None, metadata={"help": "A file containing the validation audio paths and labels."}
)
train_split_name: Optional[str] = field(
train_split_name: str = field(
default="train",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
eval_split_name: Optional[str] = field(
eval_split_name: str = field(
default="validation",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to "
"'validation'"
},
)
audio_column_name: Optional[str] = field(
audio_column_name: str = field(
default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
)
label_column_name: Optional[str] = field(
label_column_name: str = field(
default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'"}
)
max_train_samples: Optional[int] = field(
Expand All @@ -110,7 +111,7 @@ class DataTrainingArguments:
"value if set."
},
)
max_length_seconds: Optional[float] = field(
max_length_seconds: float = field(
default=20,
metadata={"help": "Audio clips will be randomly cut to this length during training if the value is set."},
)
Expand All @@ -136,11 +137,13 @@ class ModelArguments:
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
feature_extractor_name: Optional[str] = field(
default=None, metadata={"help": "Name or path of preprocessor config."}
)
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
)
attention_mask: Optional[bool] = field(
attention_mask: bool = field(
default=True, metadata={"help": "Whether to generate an attention mask in the feature extractor."}
)
use_auth_token: bool = field(
Expand All @@ -150,6 +153,24 @@ class ModelArguments:
"with private models)."
},
)
freeze_feature_extractor: Optional[bool] = field(
default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
)

def __post_init__(self):
if not self.freeze_feature_extractor and self.freeze_feature_encoder:
warnings.warn(
"The argument `--freeze_feature_extractor` is deprecated and "
"will be removed in a future version. Use `--freeze_feature_encoder`"
"instead. Setting `freeze_feature_encoder==True`.",
FutureWarning,
)
if self.freeze_feature_extractor and not self.freeze_feature_encoder:
raise ValueError(
"The argument `--freeze_feature_extractor` is deprecated and "
"should not be used in combination with `--freeze_feature_encoder`."
"Only make use of `--freeze_feature_encoder`."
)


def main():
Expand Down Expand Up @@ -302,8 +323,8 @@ def compute_metrics(eval_pred):
)

# freeze the convolutional waveform encoder
if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
if model_args.freeze_feature_encoder:
model.freeze_feature_encoder()

if training_args.do_train:
if data_args.max_train_samples is not None:
Expand Down
8 changes: 4 additions & 4 deletions examples/pytorch/speech-recognition/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ python run_speech_recognition_ctc.py \
--eval_steps="100" \
--layerdrop="0.0" \
--save_total_limit="3" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
--fp16 \
Expand Down Expand Up @@ -113,7 +113,7 @@ python -m torch.distributed.launch \
--logging_steps="1" \
--layerdrop="0.0" \
--save_total_limit="3" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
--fp16 \
Expand Down Expand Up @@ -304,7 +304,7 @@ python run_speech_recognition_seq2seq.py \
--eval_steps="400" \
--logging_steps="10" \
--save_total_limit="1" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--fp16 \
--group_by_length \
Expand Down Expand Up @@ -346,7 +346,7 @@ python -m torch.distributed.launch \
--eval_steps="400" \
--logging_steps="10" \
--save_total_limit="1" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--fp16 \
--group_by_length \
Expand Down
56 changes: 27 additions & 29 deletions examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,52 +78,50 @@ class ModelArguments:
default=None,
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
)
attention_dropout: Optional[float] = field(
attention_dropout: float = field(
default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
)
activation_dropout: Optional[float] = field(
activation_dropout: float = field(
default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
)
feat_proj_dropout: Optional[float] = field(
default=0.0, metadata={"help": "The dropout ratio for the projected features."}
)
hidden_dropout: Optional[float] = field(
feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
hidden_dropout: float = field(
default=0.0,
metadata={
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
},
)
final_dropout: Optional[float] = field(
final_dropout: float = field(
default=0.0,
metadata={"help": "The dropout probability for the final projection layer."},
)
mask_time_prob: Optional[float] = field(
mask_time_prob: float = field(
default=0.05,
metadata={
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
"vectors will be masked along the time axis."
},
)
mask_time_length: Optional[int] = field(
mask_time_length: int = field(
default=10,
metadata={"help": "Length of vector span to mask along the time axis."},
)
mask_feature_prob: Optional[float] = field(
mask_feature_prob: float = field(
default=0.0,
metadata={
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
},
)
mask_feature_length: Optional[int] = field(
mask_feature_length: int = field(
default=10,
metadata={"help": "Length of vector span to mask along the feature axis."},
)
layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."})
layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
ctc_loss_reduction: Optional[str] = field(
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
)
Expand All @@ -142,26 +140,26 @@ class DataTrainingArguments:
dataset_name: str = field(
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
dataset_config_name: str = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
train_split_name: Optional[str] = field(
train_split_name: str = field(
default="train+validation",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
eval_split_name: Optional[str] = field(
eval_split_name: str = field(
default="test",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
audio_column_name: Optional[str] = field(
audio_column_name: str = field(
default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
)
text_column_name: Optional[str] = field(
text_column_name: str = field(
default="text",
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
)
Expand Down Expand Up @@ -190,20 +188,20 @@ class DataTrainingArguments:
default=None,
metadata={"help": "A list of characters to remove from the transcripts."},
)
eval_metrics: Optional[List[str]] = list_field(
eval_metrics: List[str] = list_field(
default=["wer"],
metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
)
max_duration_in_seconds: Optional[float] = field(
max_duration_in_seconds: float = field(
default=20.0,
metadata={
"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
},
)
min_duration_in_seconds: Optional[float] = field(
min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
)
preprocessing_only: Optional[bool] = field(
preprocessing_only: bool = field(
default=False,
metadata={
"help": "Whether to only do data preprocessing and skip training. "
Expand All @@ -212,22 +210,22 @@ class DataTrainingArguments:
"so that the cached datasets can consequently be loaded in distributed training"
},
)
use_auth_token: Optional[bool] = field(
use_auth_token: bool = field(
default=False,
metadata={
"help": "If :obj:`True`, will use the token generated when running"
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
},
)
unk_token: Optional[str] = field(
unk_token: str = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
)
pad_token: Optional[str] = field(
pad_token: str = field(
default="[PAD]",
metadata={"help": "The padding token for the tokenizer"},
)
word_delimiter_token: Optional[str] = field(
word_delimiter_token: str = field(
default="|",
metadata={"help": "The word delimiter token for the tokenizer"},
)
Expand Down Expand Up @@ -545,8 +543,8 @@ def remove_special_characters(batch):
)

# freeze encoder
if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
if model_args.freeze_feature_encoder:
model.freeze_feature_encoder()

# 6. Now we preprocess the datasets including loading the audio, resampling and normalization
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ class ModelArguments:
"with private models)."
},
)
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
)


Expand All @@ -102,7 +102,7 @@ class DataTrainingArguments:
Arguments pertaining to what data we are going to input our model for training and eval.
"""

dataset_name: Optional[str] = field(
dataset_name: str = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
Expand Down Expand Up @@ -133,24 +133,24 @@ class DataTrainingArguments:
"value if set."
},
)
audio_column_name: Optional[str] = field(
audio_column_name: str = field(
default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
)
text_column_name: Optional[str] = field(
text_column_name: str = field(
default="text",
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
)
max_duration_in_seconds: Optional[float] = field(
max_duration_in_seconds: float = field(
default=20.0,
metadata={
"help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
},
)
min_duration_in_seconds: Optional[float] = field(
min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
)
preprocessing_only: Optional[bool] = field(
preprocessing_only: bool = field(
default=False,
metadata={
"help": "Whether to only do data preprocessing and skip training. "
Expand All @@ -159,19 +159,19 @@ class DataTrainingArguments:
"so that the cached datasets can consequently be loaded in distributed training"
},
)
train_split_name: Optional[str] = field(
train_split_name: str = field(
default="train",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
eval_split_name: Optional[str] = field(
eval_split_name: str = field(
default="test",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
do_lower_case: Optional[bool] = field(
do_lower_case: bool = field(
default=True,
metadata={"help": "Whether the target text should be lower cased."},
)
Expand Down Expand Up @@ -335,8 +335,8 @@ def main():
if model.config.decoder_start_token_id is None:
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
if model_args.freeze_feature_encoder:
model.freeze_feature_encoder()

# 6. Resample speech dataset if necassary
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
Expand Down
Loading