diff --git a/docs/src/index.rst b/docs/src/index.rst
index 93184f4ca..93a0ede07 100644
--- a/docs/src/index.rst
+++ b/docs/src/index.rst
@@ -37,6 +37,7 @@
    auto_tutorials/code_4_parallelism
    tutorials/cluster
    tutorials/pytorch_a2c_ppo
+   tutorials/speechbrain_tutorial
 
 .. toctree::
    :caption: Plugins
diff --git a/docs/src/tutorials/speech-brain.rst b/docs/src/tutorials/speech-brain.rst
new file mode 100644
index 000000000..cd42ef723
--- /dev/null
+++ b/docs/src/tutorials/speech-brain.rst
@@ -0,0 +1,112 @@
+********************
+SpeechBrain
+********************
+
+In this short tutorial, we're going to demonstrate how Oríon can be integrated with a `SpeechBrain
+<https://speechbrain.github.io/>`_ speech recognition model.
+The files mentioned in this tutorial are available in the `Oríon
+<https://github.com/Epistimio/orion>`_ repository.
+
+Installation and setup
+======================
+
+Make sure Oríon is installed (:doc:`/install/core`).
+
+Then install SpeechBrain using ``$ pip install speechbrain``.
+
+Code used in this tutorial
+==========================
+
+In this tutorial, we are going to use some code from the `SpeechBrain
+<https://github.com/speechbrain/speechbrain>`_ repository, more specifically a speech recognition
+template provided as an example. We will repurpose this template and adapt it for Oríon. The
+template used for creating this tutorial can be found `here
+<https://github.com/speechbrain/speechbrain/tree/develop/templates/speech_recognition>`_.
+You can also directly see the code modified for this example in
+``examples/speechbrain_tutorial``.
+
+We kept the ``train.py`` file as is, but created a ``main.py`` file containing the ``main``
+function, which we slightly modified to optimize the hyperparameters with Oríon.
+
+Adapting SpeechBrain for Oríon
+==============================
+
+The adaptation required to use Oríon is quite simple.
+
+1) We first need to import ``report_objective`` from ``orion.client`` into the project.
+
+.. code-block:: python
+
+    from orion.client import report_objective
+
+2) We then need to change the final evaluation so that it runs on the validation data instead of
+the test data. The evaluation call should look like this; it returns the validation loss.
+
+.. literalinclude:: /../../examples/speechbrain_tutorial/main.py
+   :language: python
+   :lines: 75-80
+
+3) Finally, we call ``report_objective`` at the end to report the final objective value,
+the validation loss, to Oríon.
+
+.. code-block:: python
+
+    report_objective(valid_stats)
+
+The code is now adapted and ready to be used with Oríon.
+
+Execution
+=========
+
+We are now going to call the ``orion hunt`` command.
+Notice that we still need to give the ``train.yaml``
+file to SpeechBrain, since the general configuration is in there. However, we specify
+the hyperparameters that we want to optimize on the command line,
+which automatically overrides the values set in ``train.yaml``. When an argument
+is defined both in the YAML configuration file and on the command line, SpeechBrain
+gives precedence to the value provided on the command line. Defining the hyperparameters on
+the command line is thus what allows Oríon to override the values in ``train.yaml``.
+
+.. code-block:: bash
+
+    orion hunt \
+        --enable-evc -n <experiment_name> \
+        python main.py train.yaml \
+        --lr~'loguniform(0.05, 0.2)' \
+        --ctc_weight~'loguniform(0.25, 0.75)' \
+        --label_smoothing~'loguniform(1e-10, 10e-5)' \
+        --coverage_penalty~'loguniform(1.0, 2.0)' \
+        --temperature~'loguniform(1.0, 1.5)' \
+        --temperature_lm~'loguniform(1.0, 1.5)'
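+
+Under the hood, ``main.py`` simply forwards these extra command-line arguments to HyperPyYAML,
+so the values sampled by Oríon replace the ones defined in ``train.yaml``. The following minimal
+sketch mirrors the beginning of ``examples/speechbrain_tutorial/main.py``:
+
+.. code-block:: python
+
+    import sys
+
+    import speechbrain as sb
+    from hyperpyyaml import load_hyperpyyaml
+
+    # `overrides` holds the hyperparameter values passed on the command line by Oríon.
+    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+
+    # HyperPyYAML gives precedence to the overrides, so e.g. the sampled `lr`
+    # replaces the `lr` defined in train.yaml.
+    with open(hparams_file) as fin:
+        hparams = load_hyperpyyaml(fin, overrides)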
+
+Results
+=======
+
+When an experiment reaches its termination criterion (typically ``max-trials``),
+you can see the results using the following command:
+
+.. code-block:: bash
+
+    $ orion info -n <experiment_name>
+
+This outputs the following statistics:
+
+.. code-block:: bash
+
+    Stats
+    =====
+    completed: True
+    trials completed: 209
+    best trial:
+      id: 8675cfcfba768243e1ed1ac7825c69b6
+      evaluation: 0.13801406680803444
+      params:
+        /coverage_penalty: 1.396
+        /ctc_weight: 0.389
+        /label_smoothing: 2.044e-10
+        /lr: 0.06462
+        /temperature: 1.175
+        /temperature_lm: 1.087
+      start time: 2022-09-29 14:37:41.048314
+      finish time: 2022-09-30 20:08:07.384765
+      duration: 1 day, 5:30:26.336451
diff --git a/examples/speechbrain_tutorial/main.py b/examples/speechbrain_tutorial/main.py
new file mode 100644
index 000000000..a21f54039
--- /dev/null
+++ b/examples/speechbrain_tutorial/main.py
@@ -0,0 +1,82 @@
+import logging
+import sys
+
+import speechbrain as sb
+import torch
+from hyperpyyaml import load_hyperpyyaml
+from mini_librispeech_prepare import prepare_mini_librispeech
+from speechbrain.utils.distributed import run_on_main
+from train import ASR, dataio_prepare
+
+from orion.client import report_objective
+
+logger = logging.getLogger(__name__)
+
+if __name__ == "__main__":
+    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+
+    # Initialize ddp (useful only for multi-GPU DDP training)
+    sb.utils.distributed.ddp_init_group(run_opts)
+
+    # Load hyperparameters file with command-line overrides
+    with open(hparams_file) as fin:
+        hparams = load_hyperpyyaml(fin, overrides)
+
+    # Create experiment directory
+    sb.create_experiment_directory(
+        experiment_directory=hparams["output_folder"],
+        hyperparams_to_save=hparams_file,
+        overrides=overrides,
+    )
+
+    # Data preparation, to be run on only one process.
+    sb.utils.distributed.run_on_main(
+        prepare_mini_librispeech,
+        kwargs={
+            "data_folder": hparams["data_folder"],
+            "save_json_train": hparams["train_annotation"],
+            "save_json_valid": hparams["valid_annotation"],
+            "save_json_test": hparams["test_annotation"],
+        },
+    )
+
+    # We can now directly create the datasets for training, valid, and test
+    datasets = dataio_prepare(hparams)
+
+    # In this case, pre-training is essential because mini-librispeech is not
+    # big enough to train an end-to-end model from scratch. With bigger dataset
+    # you can train from scratch and avoid this step.
+    # We download the pretrained LM from HuggingFace (or elsewhere depending on
+    # the path given in the YAML file). The tokenizer is loaded at the same time.
+    run_on_main(hparams["pretrainer"].collect_files)
+    hparams["pretrainer"].load_collected(device=torch.device("cpu"))
+
+    # Trainer initialization
+    asr_brain = ASR(
+        modules=hparams["modules"],
+        opt_class=hparams["opt_class"],
+        hparams=hparams,
+        run_opts=run_opts,
+        checkpointer=hparams["checkpointer"],
+    )
+
+    # The `fit()` method iterates the training loop, calling the methods
+    # necessary to update the parameters of the model. Since all objects
+    # with changing state are managed by the Checkpointer, training can be
+    # stopped at any point, and will be resumed on next call.
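+    # Note that `hparams` already contains the hyperparameter values sampled by
+    # Oríon: they were passed on the command line and merged into the YAML
+    # configuration by `load_hyperpyyaml` above.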
+ asr_brain.fit( + asr_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + valid_stats = asr_brain.evaluate( + test_set=datasets["valid"], + min_key="WER", + test_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + report_objective(valid_stats) diff --git a/examples/speechbrain_tutorial/train.py b/examples/speechbrain_tutorial/train.py new file mode 100644 index 000000000..ad58f51ac --- /dev/null +++ b/examples/speechbrain_tutorial/train.py @@ -0,0 +1,461 @@ +#!/usr/bin/env/python3 +# Code from The SpeechBrain Github repository : +# https://github.com/speechbrain/speechbrain/blob/develop/templates/speech_recognition/ASR/train.py +"""Recipe for training a sequence-to-sequence ASR system with mini-librispeech. +The system employs an encoder, a decoder, and an attention mechanism +between them. Decoding is performed with beam search coupled with a neural +language model. + +To run this recipe, do the following: +> python train.py train.yaml + +With the default hyperparameters, the system employs an LSTM encoder. +The decoder is based on a standard GRU. Beam search coupled with an RNN language +model is used on the top of decoder probabilities. + +The neural network is trained on both CTC and negative-log likelihood +targets and sub-word units estimated with Byte Pairwise Encoding (BPE) +are used as basic recognition tokens. Training is performed on the mini-librispeech +dataset. Note that this is a tiny dataset used here just to +provide a working example. To achieve a better performance you have to train with +larger datasets, such as the full LibriSpeech one. In this case, to allow the +model to converge, we pre-train it with a bigger one (trained on the full librispeech +with the seq2seq 1k BPE recipe). + +The experiment file is flexible enough to support a large variety of +different systems. By properly changing the parameter files, you can try +different encoders, decoders, tokens (e.g, characters instead of BPE). + +This recipe assumes that the tokenizer and the LM are already trained. +To avoid token mismatches, the tokenizer used for the acoustic model is +the same use for the LM. The recipe downloads the pre-trained tokenizer +and LM. + +If you would like to train a full system from scratch do the following: +1- Train a tokenizer (see ../Tokenizer) +2- Train a language model (see ../LM) +3- Train the speech recognizer (with this code). + + +Authors + * Mirco Ravanelli 2020 + * Ju-Chieh Chou 2020 + * Abdel Heba 2020 + * Peter Plantinga 2020 + * Samuele Cornell 2020 +""" + +import logging +import sys + +import speechbrain as sb +import torch +from hyperpyyaml import load_hyperpyyaml +from mini_librispeech_prepare import prepare_mini_librispeech +from speechbrain.utils.distributed import run_on_main + +logger = logging.getLogger(__name__) + + +# Brain class for speech recognition training +class ASR(sb.Brain): + """Class that manages the training loop. See speechbrain.core.Brain.""" + + def compute_forward(self, batch, stage): + """Runs all the computation of the CTC + seq2seq ASR. It returns the + posterior probabilities of the CTC and seq2seq networks. + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. 
+ + Returns + ------- + predictions : dict + At training time it returns predicted seq2seq log probabilities. + If needed it also returns the ctc output log probabilities. + At validation/test time, it returns the predicted tokens as well. + """ + # We first move the batch to the appropriate device. + batch = batch.to(self.device) + feats, self.feat_lens = self.prepare_features(stage, batch.sig) + tokens_bos, _ = self.prepare_tokens(stage, batch.tokens_bos) + + # Running the encoder (prevent propagation to feature extraction) + encoded_signal = self.modules.encoder(feats.detach()) + + # Embed tokens and pass tokens & encoded signal to decoder + embedded_tokens = self.modules.embedding(tokens_bos) + decoder_outputs, _ = self.modules.decoder( + embedded_tokens, encoded_signal, self.feat_lens + ) + + # Output layer for seq2seq log-probabilities + logits = self.modules.seq_lin(decoder_outputs) + predictions = {"seq_logprobs": self.hparams.log_softmax(logits)} + + if self.is_ctc_active(stage): + # Output layer for ctc log-probabilities + ctc_logits = self.modules.ctc_lin(encoded_signal) + predictions["ctc_logprobs"] = self.hparams.log_softmax(ctc_logits) + elif stage == sb.Stage.VALID: + predictions["tokens"], _ = self.hparams.valid_search( + encoded_signal, self.feat_lens + ) + elif stage == sb.Stage.TEST: + predictions["tokens"], _ = self.hparams.test_search( + encoded_signal, self.feat_lens + ) + + return predictions + + def is_ctc_active(self, stage): + """Check if CTC is currently active. + + Arguments + --------- + stage : sb.Stage + Currently executing stage. + """ + if stage != sb.Stage.TRAIN: + return False + current_epoch = self.hparams.epoch_counter.current + return current_epoch <= self.hparams.number_of_ctc_epochs + + def prepare_features(self, stage, wavs): + """Prepare features for computation on-the-fly + + Arguments + --------- + stage : sb.Stage + Currently executing stage. + wavs : tuple + The input signals (tensor) and their lengths (tensor). + """ + wavs, wav_lens = wavs + + # Add augmentation if specified. In this version of augmentation, we + # concatenate the original and the augment batches in a single bigger + # batch. This is more memory-demanding, but helps to improve the + # performance. Change it if you run OOM. + if stage == sb.Stage.TRAIN: + if hasattr(self.modules, "env_corrupt"): + wavs_noise = self.modules.env_corrupt(wavs, wav_lens) + wavs = torch.cat([wavs, wavs_noise], dim=0) + wav_lens = torch.cat([wav_lens, wav_lens]) + + if hasattr(self.hparams, "augmentation"): + wavs = self.hparams.augmentation(wavs, wav_lens) + + # Feature computation and normalization + feats = self.hparams.compute_features(wavs) + feats = self.modules.normalize(feats, wav_lens) + + return feats, wav_lens + + def prepare_tokens(self, stage, tokens): + """Double the tokens batch if features are doubled. + + Arguments + --------- + stage : sb.Stage + Currently executing stage. + tokens : tuple + The tokens (tensor) and their lengths (tensor). + """ + tokens, token_lens = tokens + if hasattr(self.modules, "env_corrupt") and stage == sb.Stage.TRAIN: + tokens = torch.cat([tokens, tokens], dim=0) + token_lens = torch.cat([token_lens, token_lens], dim=0) + return tokens, token_lens + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. 
+ batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + # Compute sequence loss against targets with EOS + tokens_eos, tokens_eos_lens = self.prepare_tokens(stage, batch.tokens_eos) + loss = sb.nnet.losses.nll_loss( + log_probabilities=predictions["seq_logprobs"], + targets=tokens_eos, + length=tokens_eos_lens, + label_smoothing=self.hparams.label_smoothing, + ) + + # Add ctc loss if necessary. The total cost is a weighted sum of + # ctc loss + seq2seq loss + if self.is_ctc_active(stage): + # Load tokens without EOS as CTC targets + tokens, tokens_lens = self.prepare_tokens(stage, batch.tokens) + loss_ctc = self.hparams.ctc_cost( + predictions["ctc_logprobs"], tokens, self.feat_lens, tokens_lens + ) + loss *= 1 - self.hparams.ctc_weight + loss += self.hparams.ctc_weight * loss_ctc + + if stage != sb.Stage.TRAIN: + # Converted predicted tokens from indexes to words + predicted_words = [ + self.hparams.tokenizer.decode_ids(prediction).split(" ") + for prediction in predictions["tokens"] + ] + target_words = [words.split(" ") for words in batch.words] + + # Monitor word error rate and character error rated at + # valid and test time. + self.wer_metric.append(batch.id, predicted_words, target_words) + self.cer_metric.append(batch.id, predicted_words, target_words) + + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + # Set up statistics trackers for this stage + # In this case, we would like to keep track of the word error rate (wer) + # and the character error rate (cer) + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.error_rate_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # Summarize the statistics from the stage for record-keeping. + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + # Update learning rate + old_lr, new_lr = self.hparams.lr_annealing(stage_stats["WER"]) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": old_lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. 
+ self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"]}, + min_keys=["WER"], + ) + + # We also write statistics about test data to stdout and to the logfile. + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + with open(self.hparams.wer_file, "w") as w: + self.wer_metric.write_stats(w) + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. + """ + # Define audio pipeline. In this case, we simply read the path contained + # in the variable wav with the audio reader. + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + """Load the audio signal. This is done on the CPU in the `collate_fn`.""" + sig = sb.dataio.dataio.read_audio(wav) + return sig + + # Define text processing pipeline. We start from the raw text and then + # encode it using the tokenizer. The tokens with BOS are used for feeding + # decoder during training, the tokens with EOS for computing the cost function. + # The tokens without BOS or EOS is for computing CTC loss. + @sb.utils.data_pipeline.takes("words") + @sb.utils.data_pipeline.provides( + "words", "tokens_list", "tokens_bos", "tokens_eos", "tokens" + ) + def text_pipeline(words): + """Processes the transcriptions to generate proper labels""" + yield words + tokens_list = hparams["tokenizer"].encode_as_ids(words) + yield tokens_list + tokens_bos = torch.LongTensor([hparams["bos_index"]] + (tokens_list)) + yield tokens_bos + tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]]) + yield tokens_eos + tokens = torch.LongTensor(tokens_list) + yield tokens + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_annotation"], + "valid": hparams["valid_annotation"], + "test": hparams["test_annotation"], + } + + for dataset in data_info: + datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=[audio_pipeline, text_pipeline], + output_keys=[ + "id", + "sig", + "words", + "tokens_bos", + "tokens_eos", + "tokens", + ], + ) + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. 
+ if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + + else: + raise NotImplementedError("sorting must be random, ascending or descending") + return datasets + + +if __name__ == "__main__": + + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Data preparation, to be run on only one process. + sb.utils.distributed.run_on_main( + prepare_mini_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_annotation"], + "save_json_valid": hparams["valid_annotation"], + "save_json_test": hparams["test_annotation"], + }, + ) + + # We can now directly create the datasets for training, valid, and test + datasets = dataio_prepare(hparams) + + # In this case, pre-training is essential because mini-librispeech is not + # big enough to train an end-to-end model from scratch. With bigger dataset + # you can train from scratch and avoid this step. + # We download the pretrained LM from HuggingFace (or elsewhere depending on + # the path given in the YAML file). The tokenizer is loaded at the same time. + run_on_main(hparams["pretrainer"].collect_files) + hparams["pretrainer"].load_collected(device=torch.device("cpu")) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. 
+ asr_brain.fit( + asr_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + test_stats = asr_brain.evaluate( + test_set=datasets["test"], + min_key="WER", + test_loader_kwargs=hparams["test_dataloader_opts"], + ) diff --git a/examples/speechbrain_tutorial/train.yaml b/examples/speechbrain_tutorial/train.yaml new file mode 100644 index 000000000..ea0099026 --- /dev/null +++ b/examples/speechbrain_tutorial/train.yaml @@ -0,0 +1,334 @@ +# ############################################################################ +# Model: E2E ASR with attention-based ASR +# Encoder: CRDNN +# Decoder: GRU + beamsearch + RNNLM +# Tokens: 1000 BPE +# losses: CTC+ NLL +# Training: mini-librispeech +# Pre-Training: librispeech 960h +# Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga, Samuele Cornell 2020 +# # ############################################################################ + +# Seed needs to be set at top of yaml, before objects with parameters are instantiated +seed: 2602 +__set_seed: !apply:torch.manual_seed [!ref ] + +# If you plan to train a system on an HPC cluster with a big dataset, +# we strongly suggest doing the following: +# 1- Compress the dataset in a single tar or zip file. +# 2- Copy your dataset locally (i.e., the local disk of the computing node). +# 3- Uncompress the dataset in the local folder. +# 4- Set data_folder with the local path +# Reading data from the local disk of the compute node (e.g. $SLURM_TMPDIR with SLURM-based clusters) is very important. +# It allows you to read the data much faster without slowing down the shared filesystem. + +data_folder: ../data # In this case, data will be automatically downloaded here. +data_folder_rirs: !ref # noise/ris dataset will automatically be downloaded here +output_folder: !ref results/CRDNN_BPE_960h_LM/ +wer_file: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Language model (LM) pretraining +# NB: To avoid mismatch, the speech recognizer must be trained with the same +# tokenizer used for LM training. Here, we download everything from the +# speechbrain HuggingFace repository. However, a local path pointing to a +# directory containing the lm.ckpt and tokenizer.ckpt may also be specified +# instead. E.g if you want to use your own LM / tokenizer. +pretrained_path: speechbrain/asr-crdnn-rnnlm-librispeech + + +# Path where data manifest files will be stored. The data manifest files are created by the +# data preparation script +train_annotation: ../train.json +valid_annotation: ../valid.json +test_annotation: ../test.json + +# The train logger writes training statistics to a file, as well as stdout. 
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +# Training parameters +number_of_epochs: 15 +number_of_ctc_epochs: 5 +batch_size: 8 +lr: 0.1 +ctc_weight: 0.5 +sorting: ascending +ckpt_interval_minutes: 15 # save checkpoint every N min +label_smoothing: 0.1 + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +# Feature parameters +sample_rate: 16000 +n_fft: 400 +n_mels: 40 + +# Model parameters +activation: !name:torch.nn.LeakyReLU +dropout: 0.15 # 0.0 - 0.3 +cnn_blocks: 2 +cnn_channels: (128, 256) +inter_layer_pooling_size: (2, 2) +cnn_kernelsize: (3, 3) +time_pooling_size: 4 +rnn_class: !name:speechbrain.nnet.RNN.LSTM +rnn_layers: 4 +rnn_neurons: 1024 +rnn_bidirectional: True +dnn_blocks: 2 +dnn_neurons: 512 +emb_size: 128 +dec_neurons: 1024 +output_neurons: 1000 # Number of tokens (same as LM) +blank_index: 0 +bos_index: 0 +eos_index: 0 + +# Decoding parameters +min_decode_ratio: 0.0 +max_decode_ratio: 1.0 +valid_beam_size: 8 +test_beam_size: 80 +eos_threshold: 1.5 +using_max_attn_shift: True +max_attn_shift: 240 +lm_weight: 0.50 +ctc_weight_decode: 0.0 +coverage_penalty: 1.5 +temperature: 1.25 +temperature_lm: 1.25 + +# The first object passed to the Brain class is this "Epoch Counter" +# which is saved by the Checkpointer so that training can be resumed +# if it gets interrupted at any point. +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +# Feature extraction +compute_features: !new:speechbrain.lobes.features.Fbank + sample_rate: !ref + n_fft: !ref + n_mels: !ref + +# Feature normalization (mean and std) +normalize: !new:speechbrain.processing.features.InputNormalization + norm_type: global + +# Added noise and reverb come from OpenRIR dataset, automatically +# downloaded and prepared with this Environmental Corruption class. +env_corrupt: !new:speechbrain.lobes.augment.EnvCorrupt + openrir_folder: !ref + babble_prob: 0.0 + reverb_prob: 0.0 + noise_prob: 1.0 + noise_snr_low: 0 + noise_snr_high: 15 + +# Adds speech change + time and frequency dropouts (time-domain implementation). +augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment + sample_rate: !ref + speeds: [95, 100, 105] + +# The CRDNN model is an encoder that combines CNNs, RNNs, and DNNs. +encoder: !new:speechbrain.lobes.models.CRDNN.CRDNN + input_shape: [null, null, !ref ] + activation: !ref + dropout: !ref + cnn_blocks: !ref + cnn_channels: !ref + cnn_kernelsize: !ref + inter_layer_pooling_size: !ref + time_pooling: True + using_2d_pooling: False + time_pooling_size: !ref + rnn_class: !ref + rnn_layers: !ref + rnn_neurons: !ref + rnn_bidirectional: !ref + rnn_re_init: True + dnn_blocks: !ref + dnn_neurons: !ref + use_rnnp: False + +# Embedding (from indexes to an embedding space of dimension emb_size). +embedding: !new:speechbrain.nnet.embedding.Embedding + num_embeddings: !ref + embedding_dim: !ref + +# Attention-based RNN decoder. +decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder + enc_dim: !ref + input_size: !ref + rnn_type: gru + attn_type: location + hidden_size: !ref + attn_dim: 1024 + num_layers: 1 + scaling: 1.0 + channels: 10 + kernel_size: 100 + re_init: True + dropout: !ref + +# Linear transformation on the top of the encoder. +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +# Linear transformation on the top of the decoder. 
+seq_lin: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +# Final softmax (for log posteriors computation). +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +# Cost definition for the CTC part. +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +# Tokenizer initialization +tokenizer: !new:sentencepiece.SentencePieceProcessor + +# Objects in "modules" dict will have their parameters moved to the correct +# device, as well as having train()/eval() called on them by the Brain class +modules: + encoder: !ref + embedding: !ref + decoder: !ref + ctc_lin: !ref + seq_lin: !ref + normalize: !ref + env_corrupt: !ref + lm_model: !ref + +# Gathering all the submodels in a single model object. +model: !new:torch.nn.ModuleList + - - !ref + - !ref + - !ref + - !ref + - !ref + +# This is the RNNLM that is used according to the Huggingface repository +# NB: It has to match the pre-trained RNNLM!! +lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM + output_neurons: !ref + embedding_dim: !ref + activation: !name:torch.nn.LeakyReLU + dropout: 0.0 + rnn_layers: 2 + rnn_neurons: 2048 + dnn_blocks: 1 + dnn_neurons: 512 + return_hidden: True # For inference + +# Beamsearch is applied on the top of the decoder. If the language model is +# given, a language model is applied (with a weight specified in lm_weight). +# If ctc_weight is set, the decoder uses CTC + attention beamsearch. This +# improves the performance, but slows down decoding. For a description of +# the other parameters, please see the speechbrain.decoders.S2SRNNBeamSearchLM. + +# It makes sense to have a lighter search during validation. In this case, +# we don't use the LM and CTC probabilities during decoding. +valid_search: !new:speechbrain.decoders.S2SRNNBeamSearcher + embedding: !ref + decoder: !ref + linear: !ref + ctc_linear: !ref + bos_index: !ref + eos_index: !ref + blank_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + eos_threshold: !ref + using_max_attn_shift: !ref + max_attn_shift: !ref + coverage_penalty: !ref + temperature: !ref + +# The final decoding on the test set can be more computationally demanding. +# In this case, we use the LM + CTC probabilities during decoding as well. +# Please, remove this part if you need a faster decoder. +test_search: !new:speechbrain.decoders.S2SRNNBeamSearchLM + embedding: !ref + decoder: !ref + linear: !ref + ctc_linear: !ref + language_model: !ref + bos_index: !ref + eos_index: !ref + blank_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + eos_threshold: !ref + using_max_attn_shift: !ref + max_attn_shift: !ref + coverage_penalty: !ref + lm_weight: !ref + ctc_weight: !ref + temperature: !ref + temperature_lm: !ref + +# This function manages learning rate annealing over the epochs. +# We here use the NewBoB algorithm, that anneals the learning rate if +# the improvements over two consecutive epochs is less than the defined +# threshold. +lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 + +# This optimizer will be constructed by the Brain class after all parameters +# are moved to the correct device. Then it will be added to the checkpointer. +opt_class: !name:torch.optim.Adadelta + lr: !ref + rho: 0.95 + eps: 1.e-8 + +# Functions that compute the statistics to track during the validation step. 
+error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True + +# This object is used for saving the state of training both so that it +# can be resumed if it gets interrupted, and also so that the best checkpoint +# can be later loaded for evaluation or inference. +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + normalizer: !ref + counter: !ref + +# This object is used to pretrain the language model and the tokenizers +# (defined above). In this case, we also pretrain the ASR model (to make +# sure the model converges on a small amount of data) +pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer + collect_in: !ref + loadables: + lm: !ref + tokenizer: !ref + model: !ref + paths: + lm: !ref /lm.ckpt + tokenizer: !ref /tokenizer.ckpt + model: !ref /asr.ckpt