
Commit

[Wav2Vec2] Rename model's feature extractor to feature encoder (huggingface#14959)

* rename classes

* clean up more namings

* remove bogus file

* Apply suggestions from code review

* Apply suggestions from code review

* replace more names

* more regex replace

* make style

* correct

* correct more

* make style

* finish

* correct more in wav2vec2

* make style

* improve freeze_extractor

* add aliases

* add tf aliases
patrickvonplaten authored and Steven committed Jan 6, 2022
1 parent 60b5149 commit fcdedd8
Showing 32 changed files with 658 additions and 215 deletions.
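
The "add aliases" and "add tf aliases" entries in the commit message above indicate that the old method name is kept as a deprecated wrapper, so downstream code calling `freeze_feature_extractor()` keeps working while emitting a `FutureWarning`. Below is a minimal sketch of that pattern; `Wav2Vec2Sketch` and `FeatureEncoder` are hypothetical stand-ins for illustration, not names from this commit, and the warning text is not a verbatim copy of the modeling code.

```python
import warnings


class FeatureEncoder:
    """Stand-in for the convolutional feature encoder (illustrative only)."""

    def __init__(self):
        self.frozen = False

    def _freeze_parameters(self):
        # In the real model this would set requires_grad=False on the conv parameters.
        self.frozen = True


class Wav2Vec2Sketch:
    """Toy model showing the renamed method plus its deprecated alias."""

    def __init__(self):
        self.feature_encoder = FeatureEncoder()

    def freeze_feature_encoder(self):
        """New name introduced by this commit."""
        self.feature_encoder._freeze_parameters()

    def freeze_feature_extractor(self):
        """Old name, kept only as a deprecated alias."""
        warnings.warn(
            "`freeze_feature_extractor` is deprecated; use `freeze_feature_encoder` instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()


model = Wav2Vec2Sketch()
model.freeze_feature_extractor()  # still works, but emits a FutureWarning
assert model.feature_encoder.frozen
```

Delegating the old name to the new one keeps a single code path and makes the eventual removal of the alias a one-line change.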
43 changes: 32 additions & 11 deletions examples/pytorch/audio-classification/run_audio_classification.py
@@ -17,6 +17,7 @@
import logging
import os
import sys
import warnings
from dataclasses import dataclass, field
from random import randint
from typing import Optional
@@ -76,24 +77,24 @@ class DataTrainingArguments:
eval_file: Optional[str] = field(
default=None, metadata={"help": "A file containing the validation audio paths and labels."}
)
train_split_name: Optional[str] = field(
train_split_name: str = field(
default="train",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
eval_split_name: Optional[str] = field(
eval_split_name: str = field(
default="validation",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to "
"'validation'"
},
)
audio_column_name: Optional[str] = field(
audio_column_name: str = field(
default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
)
label_column_name: Optional[str] = field(
label_column_name: str = field(
default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'"}
)
max_train_samples: Optional[int] = field(
@@ -110,7 +111,7 @@ class DataTrainingArguments:
"value if set."
},
)
max_length_seconds: Optional[float] = field(
max_length_seconds: float = field(
default=20,
metadata={"help": "Audio clips will be randomly cut to this length during training if the value is set."},
)
@@ -136,11 +137,13 @@ class ModelArguments:
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
feature_extractor_name: Optional[str] = field(
default=None, metadata={"help": "Name or path of preprocessor config."}
)
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
)
attention_mask: Optional[bool] = field(
attention_mask: bool = field(
default=True, metadata={"help": "Whether to generate an attention mask in the feature extractor."}
)
use_auth_token: bool = field(
@@ -150,6 +153,24 @@ class ModelArguments:
"with private models)."
},
)
freeze_feature_extractor: Optional[bool] = field(
default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
)

def __post_init__(self):
if not self.freeze_feature_extractor and self.freeze_feature_encoder:
warnings.warn(
"The argument `--freeze_feature_extractor` is deprecated and "
"will be removed in a future version. Use `--freeze_feature_encoder`"
"instead. Setting `freeze_feature_encoder==True`.",
FutureWarning,
)
if self.freeze_feature_extractor and not self.freeze_feature_encoder:
raise ValueError(
"The argument `--freeze_feature_extractor` is deprecated and "
"should not be used in combination with `--freeze_feature_encoder`."
"Only make use of `--freeze_feature_encoder`."
)


def main():
@@ -302,8 +323,8 @@ def compute_metrics(eval_pred):
)

# freeze the convolutional waveform encoder
if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
if model_args.freeze_feature_encoder:
model.freeze_feature_encoder()

if training_args.do_train:
if data_args.max_train_samples is not None:
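
In the audio-classification script above, the renamed `--freeze_feature_encoder` argument now gates the renamed model method. A hedged sketch of that call sequence outside the script might look as follows; the checkpoint name is only an example, and it assumes `AutoModelForAudioClassification` and `freeze_feature_encoder()` behave as in post-commit `transformers`.

```python
from transformers import AutoModelForAudioClassification

# Illustrative checkpoint; any Wav2Vec2-style audio-classification model should work.
model = AutoModelForAudioClassification.from_pretrained("facebook/wav2vec2-base")

freeze_feature_encoder = True  # mirrors ModelArguments.freeze_feature_encoder (default True)
if freeze_feature_encoder:
    # Freezes the convolutional waveform encoder, matching the script's
    # "freeze the convolutional waveform encoder" step; replaces the
    # deprecated model.freeze_feature_extractor() call.
    model.freeze_feature_encoder()
```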
8 changes: 4 additions & 4 deletions examples/pytorch/speech-recognition/README.md
@@ -78,7 +78,7 @@ python run_speech_recognition_ctc.py \
--eval_steps="100" \
--layerdrop="0.0" \
--save_total_limit="3" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
--fp16 \
@@ -113,7 +113,7 @@ python -m torch.distributed.launch \
--logging_steps="1" \
--layerdrop="0.0" \
--save_total_limit="3" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
--fp16 \
@@ -304,7 +304,7 @@ python run_speech_recognition_seq2seq.py \
--eval_steps="400" \
--logging_steps="10" \
--save_total_limit="1" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--fp16 \
--group_by_length \
@@ -346,7 +346,7 @@ python -m torch.distributed.launch \
--eval_steps="400" \
--logging_steps="10" \
--save_total_limit="1" \
--freeze_feature_extractor \
--freeze_feature_encoder \
--gradient_checkpointing \
--fp16 \
--group_by_length \
56 changes: 27 additions & 29 deletions examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -78,52 +78,50 @@ class ModelArguments:
default=None,
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
)
attention_dropout: Optional[float] = field(
attention_dropout: float = field(
default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
)
activation_dropout: Optional[float] = field(
activation_dropout: float = field(
default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
)
feat_proj_dropout: Optional[float] = field(
default=0.0, metadata={"help": "The dropout ratio for the projected features."}
)
hidden_dropout: Optional[float] = field(
feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
hidden_dropout: float = field(
default=0.0,
metadata={
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
},
)
final_dropout: Optional[float] = field(
final_dropout: float = field(
default=0.0,
metadata={"help": "The dropout probability for the final projection layer."},
)
mask_time_prob: Optional[float] = field(
mask_time_prob: float = field(
default=0.05,
metadata={
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
"vectors will be masked along the time axis."
},
)
mask_time_length: Optional[int] = field(
mask_time_length: int = field(
default=10,
metadata={"help": "Length of vector span to mask along the time axis."},
)
mask_feature_prob: Optional[float] = field(
mask_feature_prob: float = field(
default=0.0,
metadata={
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
},
)
mask_feature_length: Optional[int] = field(
mask_feature_length: int = field(
default=10,
metadata={"help": "Length of vector span to mask along the feature axis."},
)
layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."})
layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
ctc_loss_reduction: Optional[str] = field(
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
)
@@ -142,26 +140,26 @@ class DataTrainingArguments:
dataset_name: str = field(
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
dataset_config_name: str = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
train_split_name: Optional[str] = field(
train_split_name: str = field(
default="train+validation",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
eval_split_name: Optional[str] = field(
eval_split_name: str = field(
default="test",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
audio_column_name: Optional[str] = field(
audio_column_name: str = field(
default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
)
text_column_name: Optional[str] = field(
text_column_name: str = field(
default="text",
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
)
@@ -190,20 +188,20 @@ class DataTrainingArguments:
default=None,
metadata={"help": "A list of characters to remove from the transcripts."},
)
eval_metrics: Optional[List[str]] = list_field(
eval_metrics: List[str] = list_field(
default=["wer"],
metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
)
max_duration_in_seconds: Optional[float] = field(
max_duration_in_seconds: float = field(
default=20.0,
metadata={
"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
},
)
min_duration_in_seconds: Optional[float] = field(
min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
)
preprocessing_only: Optional[bool] = field(
preprocessing_only: bool = field(
default=False,
metadata={
"help": "Whether to only do data preprocessing and skip training. "
@@ -212,22 +210,22 @@ class DataTrainingArguments:
"so that the cached datasets can consequently be loaded in distributed training"
},
)
use_auth_token: Optional[bool] = field(
use_auth_token: bool = field(
default=False,
metadata={
"help": "If :obj:`True`, will use the token generated when running"
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
},
)
unk_token: Optional[str] = field(
unk_token: str = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
)
pad_token: Optional[str] = field(
pad_token: str = field(
default="[PAD]",
metadata={"help": "The padding token for the tokenizer"},
)
word_delimiter_token: Optional[str] = field(
word_delimiter_token: str = field(
default="|",
metadata={"help": "The word delimiter token for the tokenizer"},
)
@@ -545,8 +543,8 @@ def remove_special_characters(batch):
)

# freeze encoder
if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
if model_args.freeze_feature_encoder:
model.freeze_feature_encoder()

# 6. Now we preprocess the datasets including loading the audio, resampling and normalization
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
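
The `mask_time_prob` / `mask_time_length` help strings in the CTC script quote the expression `mask_time_prob * sequence_length // mask_time_length`. Reading that expression as the approximate number of masked spans (each `mask_time_length` frames long, so roughly a `mask_time_prob` fraction of all frames ends up masked), the script defaults work out as in this back-of-the-envelope sketch; the sequence length is illustrative, assuming the usual ~20 ms Wav2Vec2 frame rate.

```python
# Rough estimate only; the sequence length is illustrative, the other numbers are script defaults.
sequence_length = 1000        # ~20 s of audio at ~50 feature vectors per second
mask_time_prob = 0.05         # script default
mask_time_length = 10         # script default

num_spans = int(mask_time_prob * sequence_length) // mask_time_length
masked_frames = num_spans * mask_time_length  # spans may overlap in practice, so this is an upper bound

print(num_spans)       # 5 masked spans
print(masked_frames)   # ~50 frames, i.e. about 5% of the sequence
```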
examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -91,8 +91,8 @@ class ModelArguments:
"with private models)."
},
)
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
freeze_feature_encoder: bool = field(
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
)


@@ -102,7 +102,7 @@ class DataTrainingArguments:
Arguments pertaining to what data we are going to input our model for training and eval.
"""

dataset_name: Optional[str] = field(
dataset_name: str = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
@@ -133,24 +133,24 @@ class DataTrainingArguments:
"value if set."
},
)
audio_column_name: Optional[str] = field(
audio_column_name: str = field(
default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
)
text_column_name: Optional[str] = field(
text_column_name: str = field(
default="text",
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
)
max_duration_in_seconds: Optional[float] = field(
max_duration_in_seconds: float = field(
default=20.0,
metadata={
"help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
},
)
min_duration_in_seconds: Optional[float] = field(
min_duration_in_seconds: float = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
)
preprocessing_only: Optional[bool] = field(
preprocessing_only: bool = field(
default=False,
metadata={
"help": "Whether to only do data preprocessing and skip training. "
@@ -159,19 +159,19 @@ class DataTrainingArguments:
"so that the cached datasets can consequently be loaded in distributed training"
},
)
train_split_name: Optional[str] = field(
train_split_name: str = field(
default="train",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
eval_split_name: Optional[str] = field(
eval_split_name: str = field(
default="test",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
do_lower_case: Optional[bool] = field(
do_lower_case: bool = field(
default=True,
metadata={"help": "Whether the target text should be lower cased."},
)
@@ -335,8 +335,8 @@ def main():
if model.config.decoder_start_token_id is None:
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
if model_args.freeze_feature_encoder:
model.freeze_feature_encoder()

# 6. Resample speech dataset if necessary
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
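
As a quick check of what the renamed method does across these scripts, the sketch below loads a base checkpoint and confirms the convolutional encoder's parameters stop receiving gradients. It assumes the post-commit API, uses an illustrative checkpoint name, and assumes (not stated in this diff) that the frozen submodule's parameter names keep the `feature_extractor.` prefix for checkpoint compatibility.

```python
from transformers import Wav2Vec2Model

model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")  # illustrative checkpoint
model.freeze_feature_encoder()

# Parameters of the conv feature encoder should now have requires_grad == False.
frozen = [name for name, p in model.named_parameters() if not p.requires_grad]
print(len(frozen) > 0)  # True: some weights no longer require grad
print(frozen[:2])       # expected to sit under the (still so-named) `feature_extractor.` prefix
```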