diff --git a/docs/src/index.rst b/docs/src/index.rst
index 93184f4ca..93a0ede07 100644
--- a/docs/src/index.rst
+++ b/docs/src/index.rst
@@ -37,6 +37,7 @@
auto_tutorials/code_4_parallelism
tutorials/cluster
tutorials/pytorch_a2c_ppo
+ tutorials/speechbrain_tutorial
.. toctree::
:caption: Plugins
diff --git a/docs/src/tutorials/speechbrain_tutorial.rst b/docs/src/tutorials/speechbrain_tutorial.rst
new file mode 100644
index 000000000..cd42ef723
--- /dev/null
+++ b/docs/src/tutorials/speechbrain_tutorial.rst
@@ -0,0 +1,112 @@
+********************
+SpeechBrain
+********************
+
+In this short tutorial, we demonstrate how Oríon can be integrated with a `SpeechBrain
+<https://speechbrain.github.io/>`_ speech recognition model.
+The files mentioned in this tutorial are available in the `Oríon
+<https://github.com/Epistimio/orion>`_ repository.
+
+Installation and setup
+======================
+
+Make sure Oríon is installed (:doc:`/install/core`).
+
+Then install SpeechBrain using ``pip install speechbrain``.
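+
+For reference, the full setup in a fresh virtual environment could look like the following
+(assuming the default PyPI package names):
+
+.. code-block:: bash
+
+    pip install orion
+    pip install speechbrain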
+
+Code used in this tutorial
+==========================
+
+In this tutorial, we use code from the `SpeechBrain
+<https://github.com/speechbrain/speechbrain>`_ repository, more specifically the speech
+recognition template provided as an example, which we repurpose for Oríon. The template
+used for creating this tutorial can be found `here
+<https://github.com/speechbrain/speechbrain/tree/develop/templates/speech_recognition>`_.
+The code modified for this example is available in the Oríon repository under
+``examples/speechbrain_tutorial``.
+
+We reuse the ``train.py`` file as is, but moved the training entry point into a new
+``main.py``, slightly modified to optimize the hyperparameters with Oríon.
+
+Adapting SpeechBrain for Oríon
+==================================
+
+The adaptation needed for Oríon is quite simple.
+
+1) We first need to import ``report_objective`` from ``orion.client``.
+
+.. code-block:: python
+
+ from orion.client import report_objective
+
+2) We then need to evaluate the model on the validation set rather than the test set.
+The evaluation call should look like the following; it returns the validation loss.
+
+.. literalinclude:: /../../examples/speechbrain_tutorial/main.py
+ :language: python
+ :lines: 75-80
+
+3) Finally, we call ``report_objective`` at the end to report the final objective value,
+the validation loss, to Oríon.
+
+.. code-block:: python
+
+ report_objective(valid_stats)
+
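+Putting the two changes together, the end of ``main.py`` now looks like the following
+(see the complete file under ``examples/speechbrain_tutorial``):
+
+.. code-block:: python
+
+    # Evaluate on the validation set, loading the best checkpoint (lowest WER).
+    valid_stats = asr_brain.evaluate(
+        test_set=datasets["valid"],
+        min_key="WER",
+        test_loader_kwargs=hparams["valid_dataloader_opts"],
+    )
+
+    # Report the final objective value, the validation loss, to Oríon.
+    report_objective(valid_stats)
+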
+The code is now adapted and ready to be used with Oríon.
+
+Execution
+=========
+
+We are now ready to call the ``orion hunt`` command.
+Note that we still need to pass the ``train.yaml`` file to SpeechBrain, since it holds the
+general configuration. The hyperparameters we want to optimize are instead specified on the
+command line: when an argument is defined both in the YAML configuration file and on the
+command line, SpeechBrain gives precedence to the command-line value. Defining the
+hyperparameters on the command line therefore lets Oríon override the defaults set in
+``train.yaml``.
+
+.. code-block:: bash
+
+    orion hunt \
+    --enable-evc -n <experiment-name> \
+    python main.py train.yaml \
+    --lr~'loguniform(0.05, 0.2)' \
+    --ctc_weight~'loguniform(0.25, 0.75)' \
+    --label_smoothing~'loguniform(1e-10, 10e-5)' \
+    --coverage_penalty~'loguniform(1.0, 2.0)' \
+    --temperature~'loguniform(1.0, 1.5)' \
+    --temperature_lm~'loguniform(1.0, 1.5)'
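+
+By default, ``orion hunt`` keeps suggesting trials until the experiment reaches its maximum
+number of trials. This limit, as well as the optimization algorithm, can also be set through
+an Oríon configuration file passed with ``--config``. The snippet below is only a minimal
+sketch: the file name ``orion_config.yaml`` and the values are illustrative, and the exact
+keys may vary between Oríon versions.
+
+.. code-block:: yaml
+
+    experiment:
+        max_trials: 50
+        algorithms:
+            random:
+                seed: 1
+
+The file would then be passed to the command above as ``orion hunt --config orion_config.yaml ...``.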
+
+Results
+=======
+
+When an experiment reaches its termination criterion, typically ``max-trials``,
+you can inspect the results with the following command:
+
+.. code-block:: bash
+
+    $ orion info -n <experiment-name>
+
+This outputs statistics such as the following:
+
+.. code-block:: bash
+
+ Stats
+ =====
+ completed: True
+ trials completed: 209
+ best trial:
+ id: 8675cfcfba768243e1ed1ac7825c69b6
+ evaluation: 0.13801406680803444
+ params:
+ /coverage_penalty: 1.396
+ /ctc_weight: 0.389
+ /label_smoothing: 2.044e-10
+ /lr: 0.06462
+ /temperature: 1.175
+ /temperature_lm: 1.087
+ start time: 2022-09-29 14:37:41.048314
+ finish time: 2022-09-30 20:08:07.384765
+ duration: 1 day, 5:30:26.336451
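+
+The results can also be retrieved programmatically with Oríon's Python client. The snippet
+below is a minimal sketch: the experiment name ``speechbrain-tutorial`` is only a placeholder
+for whatever name was passed to ``-n`` above.
+
+.. code-block:: python
+
+    from orion.client import get_experiment
+
+    # Load the experiment in read-only mode from the configured storage.
+    experiment = get_experiment(name="speechbrain-tutorial")
+
+    # Summary statistics, equivalent to the `orion info` command.
+    print(experiment.stats)
+
+    # Inspect the completed trials and their objectives.
+    for trial in experiment.fetch_trials_by_status("completed"):
+        print(trial.params, trial.objective.value)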
diff --git a/examples/speechbrain_tutorial/main.py b/examples/speechbrain_tutorial/main.py
new file mode 100644
index 000000000..a21f54039
--- /dev/null
+++ b/examples/speechbrain_tutorial/main.py
@@ -0,0 +1,82 @@
+import logging
+import sys
+
+import speechbrain as sb
+import torch
+from hyperpyyaml import load_hyperpyyaml
+from mini_librispeech_prepare import prepare_mini_librispeech
+from speechbrain.utils.distributed import run_on_main
+from train import ASR, dataio_prepare
+
+from orion.client import report_objective
+
+logger = logging.getLogger(__name__)
+
+if __name__ == "__main__":
+ hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+
+ # Initialize ddp (useful only for multi-GPU DDP training)
+ sb.utils.distributed.ddp_init_group(run_opts)
+
+ # Load hyperparameters file with command-line overrides
+ with open(hparams_file) as fin:
+ hparams = load_hyperpyyaml(fin, overrides)
+
+ # Create experiment directory
+ sb.create_experiment_directory(
+ experiment_directory=hparams["output_folder"],
+ hyperparams_to_save=hparams_file,
+ overrides=overrides,
+ )
+
+ # Data preparation, to be run on only one process.
+ sb.utils.distributed.run_on_main(
+ prepare_mini_librispeech,
+ kwargs={
+ "data_folder": hparams["data_folder"],
+ "save_json_train": hparams["train_annotation"],
+ "save_json_valid": hparams["valid_annotation"],
+ "save_json_test": hparams["test_annotation"],
+ },
+ )
+
+ # We can now directly create the datasets for training, valid, and test
+ datasets = dataio_prepare(hparams)
+
+ # In this case, pre-training is essential because mini-librispeech is not
+ # big enough to train an end-to-end model from scratch. With bigger dataset
+ # you can train from scratch and avoid this step.
+ # We download the pretrained LM from HuggingFace (or elsewhere depending on
+ # the path given in the YAML file). The tokenizer is loaded at the same time.
+ run_on_main(hparams["pretrainer"].collect_files)
+ hparams["pretrainer"].load_collected(device=torch.device("cpu"))
+
+ # Trainer initialization
+ asr_brain = ASR(
+ modules=hparams["modules"],
+ opt_class=hparams["opt_class"],
+ hparams=hparams,
+ run_opts=run_opts,
+ checkpointer=hparams["checkpointer"],
+ )
+
+ # The `fit()` method iterates the training loop, calling the methods
+ # necessary to update the parameters of the model. Since all objects
+ # with changing state are managed by the Checkpointer, training can be
+ # stopped at any point, and will be resumed on next call.
+ asr_brain.fit(
+ asr_brain.hparams.epoch_counter,
+ datasets["train"],
+ datasets["valid"],
+ train_loader_kwargs=hparams["train_dataloader_opts"],
+ valid_loader_kwargs=hparams["valid_dataloader_opts"],
+ )
+
+ # Load best checkpoint for evaluation
+ valid_stats = asr_brain.evaluate(
+ test_set=datasets["valid"],
+ min_key="WER",
+ test_loader_kwargs=hparams["valid_dataloader_opts"],
+ )
+
+ report_objective(valid_stats)
diff --git a/examples/speechbrain_tutorial/train.py b/examples/speechbrain_tutorial/train.py
new file mode 100644
index 000000000..ad58f51ac
--- /dev/null
+++ b/examples/speechbrain_tutorial/train.py
@@ -0,0 +1,461 @@
+#!/usr/bin/env python3
+# Code from the SpeechBrain GitHub repository:
+# https://github.com/speechbrain/speechbrain/blob/develop/templates/speech_recognition/ASR/train.py
+"""Recipe for training a sequence-to-sequence ASR system with mini-librispeech.
+The system employs an encoder, a decoder, and an attention mechanism
+between them. Decoding is performed with beam search coupled with a neural
+language model.
+
+To run this recipe, do the following:
+> python train.py train.yaml
+
+With the default hyperparameters, the system employs an LSTM encoder.
+The decoder is based on a standard GRU. Beam search coupled with an RNN language
+model is used on the top of decoder probabilities.
+
+The neural network is trained on both CTC and negative-log likelihood
+targets and sub-word units estimated with Byte Pair Encoding (BPE)
+are used as basic recognition tokens. Training is performed on the mini-librispeech
+dataset. Note that this is a tiny dataset used here just to
+provide a working example. To achieve a better performance you have to train with
+larger datasets, such as the full LibriSpeech one. In this case, to allow the
+model to converge, we pre-train it with a bigger one (trained on the full librispeech
+with the seq2seq 1k BPE recipe).
+
+The experiment file is flexible enough to support a large variety of
+different systems. By properly changing the parameter files, you can try
+different encoders, decoders, tokens (e.g., characters instead of BPE).
+
+This recipe assumes that the tokenizer and the LM are already trained.
+To avoid token mismatches, the tokenizer used for the acoustic model is
+the same one used for the LM. The recipe downloads the pre-trained tokenizer
+and LM.
+
+If you would like to train a full system from scratch do the following:
+1- Train a tokenizer (see ../Tokenizer)
+2- Train a language model (see ../LM)
+3- Train the speech recognizer (with this code).
+
+
+Authors
+ * Mirco Ravanelli 2020
+ * Ju-Chieh Chou 2020
+ * Abdel Heba 2020
+ * Peter Plantinga 2020
+ * Samuele Cornell 2020
+"""
+
+import logging
+import sys
+
+import speechbrain as sb
+import torch
+from hyperpyyaml import load_hyperpyyaml
+from mini_librispeech_prepare import prepare_mini_librispeech
+from speechbrain.utils.distributed import run_on_main
+
+logger = logging.getLogger(__name__)
+
+
+# Brain class for speech recognition training
+class ASR(sb.Brain):
+ """Class that manages the training loop. See speechbrain.core.Brain."""
+
+ def compute_forward(self, batch, stage):
+ """Runs all the computation of the CTC + seq2seq ASR. It returns the
+ posterior probabilities of the CTC and seq2seq networks.
+
+ Arguments
+ ---------
+ batch : PaddedBatch
+ This batch object contains all the relevant tensors for computation.
+ stage : sb.Stage
+ One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
+
+ Returns
+ -------
+ predictions : dict
+ At training time it returns predicted seq2seq log probabilities.
+ If needed it also returns the ctc output log probabilities.
+ At validation/test time, it returns the predicted tokens as well.
+ """
+ # We first move the batch to the appropriate device.
+ batch = batch.to(self.device)
+ feats, self.feat_lens = self.prepare_features(stage, batch.sig)
+ tokens_bos, _ = self.prepare_tokens(stage, batch.tokens_bos)
+
+ # Running the encoder (prevent propagation to feature extraction)
+ encoded_signal = self.modules.encoder(feats.detach())
+
+ # Embed tokens and pass tokens & encoded signal to decoder
+ embedded_tokens = self.modules.embedding(tokens_bos)
+ decoder_outputs, _ = self.modules.decoder(
+ embedded_tokens, encoded_signal, self.feat_lens
+ )
+
+ # Output layer for seq2seq log-probabilities
+ logits = self.modules.seq_lin(decoder_outputs)
+ predictions = {"seq_logprobs": self.hparams.log_softmax(logits)}
+
+ if self.is_ctc_active(stage):
+ # Output layer for ctc log-probabilities
+ ctc_logits = self.modules.ctc_lin(encoded_signal)
+ predictions["ctc_logprobs"] = self.hparams.log_softmax(ctc_logits)
+ elif stage == sb.Stage.VALID:
+ predictions["tokens"], _ = self.hparams.valid_search(
+ encoded_signal, self.feat_lens
+ )
+ elif stage == sb.Stage.TEST:
+ predictions["tokens"], _ = self.hparams.test_search(
+ encoded_signal, self.feat_lens
+ )
+
+ return predictions
+
+ def is_ctc_active(self, stage):
+ """Check if CTC is currently active.
+
+ Arguments
+ ---------
+ stage : sb.Stage
+ Currently executing stage.
+ """
+ if stage != sb.Stage.TRAIN:
+ return False
+ current_epoch = self.hparams.epoch_counter.current
+ return current_epoch <= self.hparams.number_of_ctc_epochs
+
+ def prepare_features(self, stage, wavs):
+ """Prepare features for computation on-the-fly
+
+ Arguments
+ ---------
+ stage : sb.Stage
+ Currently executing stage.
+ wavs : tuple
+ The input signals (tensor) and their lengths (tensor).
+ """
+ wavs, wav_lens = wavs
+
+ # Add augmentation if specified. In this version of augmentation, we
+ # concatenate the original and the augment batches in a single bigger
+ # batch. This is more memory-demanding, but helps to improve the
+ # performance. Change it if you run OOM.
+ if stage == sb.Stage.TRAIN:
+ if hasattr(self.modules, "env_corrupt"):
+ wavs_noise = self.modules.env_corrupt(wavs, wav_lens)
+ wavs = torch.cat([wavs, wavs_noise], dim=0)
+ wav_lens = torch.cat([wav_lens, wav_lens])
+
+ if hasattr(self.hparams, "augmentation"):
+ wavs = self.hparams.augmentation(wavs, wav_lens)
+
+ # Feature computation and normalization
+ feats = self.hparams.compute_features(wavs)
+ feats = self.modules.normalize(feats, wav_lens)
+
+ return feats, wav_lens
+
+ def prepare_tokens(self, stage, tokens):
+ """Double the tokens batch if features are doubled.
+
+ Arguments
+ ---------
+ stage : sb.Stage
+ Currently executing stage.
+ tokens : tuple
+ The tokens (tensor) and their lengths (tensor).
+ """
+ tokens, token_lens = tokens
+ if hasattr(self.modules, "env_corrupt") and stage == sb.Stage.TRAIN:
+ tokens = torch.cat([tokens, tokens], dim=0)
+ token_lens = torch.cat([token_lens, token_lens], dim=0)
+ return tokens, token_lens
+
+ def compute_objectives(self, predictions, batch, stage):
+ """Computes the loss given the predicted and targeted outputs. We here
+ do multi-task learning and the loss is a weighted sum of the ctc + seq2seq
+ costs.
+
+ Arguments
+ ---------
+ predictions : dict
+ The output dict from `compute_forward`.
+ batch : PaddedBatch
+ This batch object contains all the relevant tensors for computation.
+ stage : sb.Stage
+ One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
+
+ Returns
+ -------
+ loss : torch.Tensor
+ A one-element tensor used for backpropagating the gradient.
+ """
+ # Compute sequence loss against targets with EOS
+ tokens_eos, tokens_eos_lens = self.prepare_tokens(stage, batch.tokens_eos)
+ loss = sb.nnet.losses.nll_loss(
+ log_probabilities=predictions["seq_logprobs"],
+ targets=tokens_eos,
+ length=tokens_eos_lens,
+ label_smoothing=self.hparams.label_smoothing,
+ )
+
+ # Add ctc loss if necessary. The total cost is a weighted sum of
+ # ctc loss + seq2seq loss
+ if self.is_ctc_active(stage):
+ # Load tokens without EOS as CTC targets
+ tokens, tokens_lens = self.prepare_tokens(stage, batch.tokens)
+ loss_ctc = self.hparams.ctc_cost(
+ predictions["ctc_logprobs"], tokens, self.feat_lens, tokens_lens
+ )
+ loss *= 1 - self.hparams.ctc_weight
+ loss += self.hparams.ctc_weight * loss_ctc
+
+ if stage != sb.Stage.TRAIN:
+            # Convert predicted tokens from indexes to words
+ predicted_words = [
+ self.hparams.tokenizer.decode_ids(prediction).split(" ")
+ for prediction in predictions["tokens"]
+ ]
+ target_words = [words.split(" ") for words in batch.words]
+
+            # Monitor word error rate and character error rate at
+ # valid and test time.
+ self.wer_metric.append(batch.id, predicted_words, target_words)
+ self.cer_metric.append(batch.id, predicted_words, target_words)
+
+ return loss
+
+ def on_stage_start(self, stage, epoch):
+ """Gets called at the beginning of each epoch.
+
+ Arguments
+ ---------
+ stage : sb.Stage
+ One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
+ epoch : int
+ The currently-starting epoch. This is passed
+ `None` during the test stage.
+ """
+ # Set up statistics trackers for this stage
+ # In this case, we would like to keep track of the word error rate (wer)
+ # and the character error rate (cer)
+ if stage != sb.Stage.TRAIN:
+ self.cer_metric = self.hparams.cer_computer()
+ self.wer_metric = self.hparams.error_rate_computer()
+
+ def on_stage_end(self, stage, stage_loss, epoch):
+ """Gets called at the end of an epoch.
+
+ Arguments
+ ---------
+ stage : sb.Stage
+ One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST
+ stage_loss : float
+ The average loss for all of the data processed in this stage.
+ epoch : int
+ The currently-starting epoch. This is passed
+ `None` during the test stage.
+ """
+
+ # Store the train loss until the validation stage.
+ stage_stats = {"loss": stage_loss}
+ if stage == sb.Stage.TRAIN:
+ self.train_stats = stage_stats
+
+ # Summarize the statistics from the stage for record-keeping.
+ else:
+ stage_stats["CER"] = self.cer_metric.summarize("error_rate")
+ stage_stats["WER"] = self.wer_metric.summarize("error_rate")
+
+ # Perform end-of-iteration things, like annealing, logging, etc.
+ if stage == sb.Stage.VALID:
+
+ # Update learning rate
+ old_lr, new_lr = self.hparams.lr_annealing(stage_stats["WER"])
+ sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)
+
+ # The train_logger writes a summary to stdout and to the logfile.
+ self.hparams.train_logger.log_stats(
+ stats_meta={"epoch": epoch, "lr": old_lr},
+ train_stats=self.train_stats,
+ valid_stats=stage_stats,
+ )
+
+ # Save the current checkpoint and delete previous checkpoints.
+ self.checkpointer.save_and_keep_only(
+ meta={"WER": stage_stats["WER"]},
+ min_keys=["WER"],
+ )
+
+ # We also write statistics about test data to stdout and to the logfile.
+ elif stage == sb.Stage.TEST:
+ self.hparams.train_logger.log_stats(
+ stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
+ test_stats=stage_stats,
+ )
+ with open(self.hparams.wer_file, "w") as w:
+ self.wer_metric.write_stats(w)
+
+
+def dataio_prepare(hparams):
+ """This function prepares the datasets to be used in the brain class.
+ It also defines the data processing pipeline through user-defined functions.
+
+
+ Arguments
+ ---------
+ hparams : dict
+ This dictionary is loaded from the `train.yaml` file, and it includes
+ all the hyperparameters needed for dataset construction and loading.
+
+ Returns
+ -------
+ datasets : dict
+ Dictionary containing "train", "valid", and "test" keys that correspond
+ to the DynamicItemDataset objects.
+ """
+ # Define audio pipeline. In this case, we simply read the path contained
+ # in the variable wav with the audio reader.
+ @sb.utils.data_pipeline.takes("wav")
+ @sb.utils.data_pipeline.provides("sig")
+ def audio_pipeline(wav):
+ """Load the audio signal. This is done on the CPU in the `collate_fn`."""
+ sig = sb.dataio.dataio.read_audio(wav)
+ return sig
+
+ # Define text processing pipeline. We start from the raw text and then
+ # encode it using the tokenizer. The tokens with BOS are used for feeding
+ # decoder during training, the tokens with EOS for computing the cost function.
+    # The tokens without BOS or EOS are used for computing the CTC loss.
+ @sb.utils.data_pipeline.takes("words")
+ @sb.utils.data_pipeline.provides(
+ "words", "tokens_list", "tokens_bos", "tokens_eos", "tokens"
+ )
+ def text_pipeline(words):
+ """Processes the transcriptions to generate proper labels"""
+ yield words
+ tokens_list = hparams["tokenizer"].encode_as_ids(words)
+ yield tokens_list
+ tokens_bos = torch.LongTensor([hparams["bos_index"]] + (tokens_list))
+ yield tokens_bos
+ tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
+ yield tokens_eos
+ tokens = torch.LongTensor(tokens_list)
+ yield tokens
+
+ # Define datasets from json data manifest file
+ # Define datasets sorted by ascending lengths for efficiency
+ datasets = {}
+ data_folder = hparams["data_folder"]
+ data_info = {
+ "train": hparams["train_annotation"],
+ "valid": hparams["valid_annotation"],
+ "test": hparams["test_annotation"],
+ }
+
+ for dataset in data_info:
+ datasets[dataset] = sb.dataio.dataset.DynamicItemDataset.from_json(
+ json_path=data_info[dataset],
+ replacements={"data_root": data_folder},
+ dynamic_items=[audio_pipeline, text_pipeline],
+ output_keys=[
+ "id",
+ "sig",
+ "words",
+ "tokens_bos",
+ "tokens_eos",
+ "tokens",
+ ],
+ )
+ hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False
+
+ # Sorting training data with ascending order makes the code much
+ # faster because we minimize zero-padding. In most of the cases, this
+ # does not harm the performance.
+ if hparams["sorting"] == "ascending":
+ datasets["train"] = datasets["train"].filtered_sorted(sort_key="length")
+ hparams["train_dataloader_opts"]["shuffle"] = False
+
+ elif hparams["sorting"] == "descending":
+ datasets["train"] = datasets["train"].filtered_sorted(
+ sort_key="length", reverse=True
+ )
+ hparams["train_dataloader_opts"]["shuffle"] = False
+
+ elif hparams["sorting"] == "random":
+ hparams["train_dataloader_opts"]["shuffle"] = True
+
+ else:
+ raise NotImplementedError("sorting must be random, ascending or descending")
+ return datasets
+
+
+if __name__ == "__main__":
+
+ # Reading command line arguments
+ hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+
+ # Initialize ddp (useful only for multi-GPU DDP training)
+ sb.utils.distributed.ddp_init_group(run_opts)
+
+ # Load hyperparameters file with command-line overrides
+ with open(hparams_file) as fin:
+ hparams = load_hyperpyyaml(fin, overrides)
+
+ # Create experiment directory
+ sb.create_experiment_directory(
+ experiment_directory=hparams["output_folder"],
+ hyperparams_to_save=hparams_file,
+ overrides=overrides,
+ )
+
+ # Data preparation, to be run on only one process.
+ sb.utils.distributed.run_on_main(
+ prepare_mini_librispeech,
+ kwargs={
+ "data_folder": hparams["data_folder"],
+ "save_json_train": hparams["train_annotation"],
+ "save_json_valid": hparams["valid_annotation"],
+ "save_json_test": hparams["test_annotation"],
+ },
+ )
+
+ # We can now directly create the datasets for training, valid, and test
+ datasets = dataio_prepare(hparams)
+
+ # In this case, pre-training is essential because mini-librispeech is not
+ # big enough to train an end-to-end model from scratch. With bigger dataset
+ # you can train from scratch and avoid this step.
+ # We download the pretrained LM from HuggingFace (or elsewhere depending on
+ # the path given in the YAML file). The tokenizer is loaded at the same time.
+ run_on_main(hparams["pretrainer"].collect_files)
+ hparams["pretrainer"].load_collected(device=torch.device("cpu"))
+
+ # Trainer initialization
+ asr_brain = ASR(
+ modules=hparams["modules"],
+ opt_class=hparams["opt_class"],
+ hparams=hparams,
+ run_opts=run_opts,
+ checkpointer=hparams["checkpointer"],
+ )
+
+ # The `fit()` method iterates the training loop, calling the methods
+ # necessary to update the parameters of the model. Since all objects
+ # with changing state are managed by the Checkpointer, training can be
+ # stopped at any point, and will be resumed on next call.
+ asr_brain.fit(
+ asr_brain.hparams.epoch_counter,
+ datasets["train"],
+ datasets["valid"],
+ train_loader_kwargs=hparams["train_dataloader_opts"],
+ valid_loader_kwargs=hparams["valid_dataloader_opts"],
+ )
+
+ # Load best checkpoint for evaluation
+ test_stats = asr_brain.evaluate(
+ test_set=datasets["test"],
+ min_key="WER",
+ test_loader_kwargs=hparams["test_dataloader_opts"],
+ )
diff --git a/examples/speechbrain_tutorial/train.yaml b/examples/speechbrain_tutorial/train.yaml
new file mode 100644
index 000000000..ea0099026
--- /dev/null
+++ b/examples/speechbrain_tutorial/train.yaml
@@ -0,0 +1,334 @@
+# ############################################################################
+# Model: E2E ASR with attention-based ASR
+# Encoder: CRDNN
+# Decoder: GRU + beamsearch + RNNLM
+# Tokens: 1000 BPE
+# losses: CTC+ NLL
+# Training: mini-librispeech
+# Pre-Training: librispeech 960h
+# Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga, Samuele Cornell 2020
+# ############################################################################
+
+# Seed needs to be set at top of yaml, before objects with parameters are instantiated
+seed: 2602
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+
+# If you plan to train a system on an HPC cluster with a big dataset,
+# we strongly suggest doing the following:
+# 1- Compress the dataset in a single tar or zip file.
+# 2- Copy your dataset locally (i.e., the local disk of the computing node).
+# 3- Uncompress the dataset in the local folder.
+# 4- Set data_folder with the local path
+# Reading data from the local disk of the compute node (e.g. $SLURM_TMPDIR with SLURM-based clusters) is very important.
+# It allows you to read the data much faster without slowing down the shared filesystem.
+
+data_folder: ../data # In this case, data will be automatically downloaded here.
+data_folder_rirs: !ref <data_folder> # noise/rir dataset will automatically be downloaded here
+output_folder: !ref results/CRDNN_BPE_960h_LM/<seed>
+wer_file: !ref <output_folder>/wer.txt
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+
+# Language model (LM) pretraining
+# NB: To avoid mismatch, the speech recognizer must be trained with the same
+# tokenizer used for LM training. Here, we download everything from the
+# speechbrain HuggingFace repository. However, a local path pointing to a
+# directory containing the lm.ckpt and tokenizer.ckpt may also be specified
+instead, e.g., if you want to use your own LM / tokenizer.
+pretrained_path: speechbrain/asr-crdnn-rnnlm-librispeech
+
+
+# Path where data manifest files will be stored. The data manifest files are created by the
+# data preparation script
+train_annotation: ../train.json
+valid_annotation: ../valid.json
+test_annotation: ../test.json
+
+# The train logger writes training statistics to a file, as well as stdout.
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+    save_file: !ref <train_log>
+
+# Training parameters
+number_of_epochs: 15
+number_of_ctc_epochs: 5
+batch_size: 8
+lr: 0.1
+ctc_weight: 0.5
+sorting: ascending
+ckpt_interval_minutes: 15 # save checkpoint every N min
+label_smoothing: 0.1
+
+# Dataloader options
+train_dataloader_opts:
+    batch_size: !ref <batch_size>
+
+valid_dataloader_opts:
+    batch_size: !ref <batch_size>
+
+test_dataloader_opts:
+    batch_size: !ref <batch_size>
+
+
+# Feature parameters
+sample_rate: 16000
+n_fft: 400
+n_mels: 40
+
+# Model parameters
+activation: !name:torch.nn.LeakyReLU
+dropout: 0.15 # 0.0 - 0.3
+cnn_blocks: 2
+cnn_channels: (128, 256)
+inter_layer_pooling_size: (2, 2)
+cnn_kernelsize: (3, 3)
+time_pooling_size: 4
+rnn_class: !name:speechbrain.nnet.RNN.LSTM
+rnn_layers: 4
+rnn_neurons: 1024
+rnn_bidirectional: True
+dnn_blocks: 2
+dnn_neurons: 512
+emb_size: 128
+dec_neurons: 1024
+output_neurons: 1000 # Number of tokens (same as LM)
+blank_index: 0
+bos_index: 0
+eos_index: 0
+
+# Decoding parameters
+min_decode_ratio: 0.0
+max_decode_ratio: 1.0
+valid_beam_size: 8
+test_beam_size: 80
+eos_threshold: 1.5
+using_max_attn_shift: True
+max_attn_shift: 240
+lm_weight: 0.50
+ctc_weight_decode: 0.0
+coverage_penalty: 1.5
+temperature: 1.25
+temperature_lm: 1.25
+
+# The first object passed to the Brain class is this "Epoch Counter"
+# which is saved by the Checkpointer so that training can be resumed
+# if it gets interrupted at any point.
+epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
+    limit: !ref <number_of_epochs>
+
+# Feature extraction
+compute_features: !new:speechbrain.lobes.features.Fbank
+    sample_rate: !ref <sample_rate>
+    n_fft: !ref <n_fft>
+    n_mels: !ref <n_mels>
+
+# Feature normalization (mean and std)
+normalize: !new:speechbrain.processing.features.InputNormalization
+ norm_type: global
+
+# Added noise and reverb come from OpenRIR dataset, automatically
+# downloaded and prepared with this Environmental Corruption class.
+env_corrupt: !new:speechbrain.lobes.augment.EnvCorrupt
+    openrir_folder: !ref <data_folder_rirs>
+    babble_prob: 0.0
+    reverb_prob: 0.0
+    noise_prob: 1.0
+    noise_snr_low: 0
+    noise_snr_high: 15
+
+# Adds speed change + time and frequency dropouts (time-domain implementation).
+augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
+    sample_rate: !ref <sample_rate>
+    speeds: [95, 100, 105]
+
+# The CRDNN model is an encoder that combines CNNs, RNNs, and DNNs.
+encoder: !new:speechbrain.lobes.models.CRDNN.CRDNN
+    input_shape: [null, null, !ref <n_mels>]
+    activation: !ref <activation>
+    dropout: !ref <dropout>
+    cnn_blocks: !ref <cnn_blocks>
+    cnn_channels: !ref <cnn_channels>
+    cnn_kernelsize: !ref <cnn_kernelsize>
+    inter_layer_pooling_size: !ref <inter_layer_pooling_size>
+    time_pooling: True
+    using_2d_pooling: False
+    time_pooling_size: !ref <time_pooling_size>
+    rnn_class: !ref <rnn_class>
+    rnn_layers: !ref <rnn_layers>
+    rnn_neurons: !ref <rnn_neurons>
+    rnn_bidirectional: !ref <rnn_bidirectional>
+    rnn_re_init: True
+    dnn_blocks: !ref <dnn_blocks>
+    dnn_neurons: !ref <dnn_neurons>
+    use_rnnp: False
+
+# Embedding (from indexes to an embedding space of dimension emb_size).
+embedding: !new:speechbrain.nnet.embedding.Embedding
+    num_embeddings: !ref <output_neurons>
+    embedding_dim: !ref <emb_size>
+
+# Attention-based RNN decoder.
+decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
+    enc_dim: !ref <dnn_neurons>
+    input_size: !ref <emb_size>
+    rnn_type: gru
+    attn_type: location
+    hidden_size: !ref <dec_neurons>
+    attn_dim: 1024
+    num_layers: 1
+    scaling: 1.0
+    channels: 10
+    kernel_size: 100
+    re_init: True
+    dropout: !ref <dropout>
+
+# Linear transformation on the top of the encoder.
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <dnn_neurons>
+    n_neurons: !ref <output_neurons>
+
+# Linear transformation on the top of the decoder.
+seq_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <dec_neurons>
+    n_neurons: !ref <output_neurons>
+
+# Final softmax (for log posteriors computation).
+log_softmax: !new:speechbrain.nnet.activations.Softmax
+ apply_log: True
+
+# Cost definition for the CTC part.
+ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
+    blank_index: !ref <blank_index>
+
+
+# Tokenizer initialization
+tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+# Objects in "modules" dict will have their parameters moved to the correct
+# device, as well as having train()/eval() called on them by the Brain class
+modules:
+    encoder: !ref <encoder>
+    embedding: !ref <embedding>
+    decoder: !ref <decoder>
+    ctc_lin: !ref <ctc_lin>
+    seq_lin: !ref <seq_lin>
+    normalize: !ref <normalize>
+    env_corrupt: !ref <env_corrupt>
+    lm_model: !ref <lm_model>
+
+# Gathering all the submodels in a single model object.
+model: !new:torch.nn.ModuleList
+    - - !ref <encoder>
+      - !ref <embedding>
+      - !ref <decoder>
+      - !ref <ctc_lin>
+      - !ref <seq_lin>
+
+# This is the RNNLM that is used according to the Huggingface repository
+# NB: It has to match the pre-trained RNNLM!!
+lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
+    output_neurons: !ref <output_neurons>
+    embedding_dim: !ref <emb_size>
+    activation: !name:torch.nn.LeakyReLU
+    dropout: 0.0
+    rnn_layers: 2
+    rnn_neurons: 2048
+    dnn_blocks: 1
+    dnn_neurons: 512
+    return_hidden: True # For inference
+
+# Beamsearch is applied on the top of the decoder. If the language model is
+# given, a language model is applied (with a weight specified in lm_weight).
+# If ctc_weight is set, the decoder uses CTC + attention beamsearch. This
+# improves the performance, but slows down decoding. For a description of
+# the other parameters, please see the speechbrain.decoders.S2SRNNBeamSearchLM.
+
+# It makes sense to have a lighter search during validation. In this case,
+# we don't use the LM and CTC probabilities during decoding.
+valid_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
+    embedding: !ref <embedding>
+    decoder: !ref <decoder>
+    linear: !ref <seq_lin>
+    ctc_linear: !ref <ctc_lin>
+    bos_index: !ref <bos_index>
+    eos_index: !ref <eos_index>
+    blank_index: !ref <blank_index>
+    min_decode_ratio: !ref <min_decode_ratio>
+    max_decode_ratio: !ref <max_decode_ratio>
+    beam_size: !ref <valid_beam_size>
+    eos_threshold: !ref <eos_threshold>
+    using_max_attn_shift: !ref <using_max_attn_shift>
+    max_attn_shift: !ref <max_attn_shift>
+    coverage_penalty: !ref <coverage_penalty>
+    temperature: !ref <temperature>
+
+# The final decoding on the test set can be more computationally demanding.
+# In this case, we use the LM + CTC probabilities during decoding as well.
+# Please, remove this part if you need a faster decoder.
+test_search: !new:speechbrain.decoders.S2SRNNBeamSearchLM
+    embedding: !ref <embedding>
+    decoder: !ref <decoder>
+    linear: !ref <seq_lin>
+    ctc_linear: !ref <ctc_lin>
+    language_model: !ref <lm_model>
+    bos_index: !ref <bos_index>
+    eos_index: !ref <eos_index>
+    blank_index: !ref <blank_index>
+    min_decode_ratio: !ref <min_decode_ratio>
+    max_decode_ratio: !ref <max_decode_ratio>
+    beam_size: !ref <test_beam_size>
+    eos_threshold: !ref <eos_threshold>
+    using_max_attn_shift: !ref <using_max_attn_shift>
+    max_attn_shift: !ref <max_attn_shift>
+    coverage_penalty: !ref <coverage_penalty>
+    lm_weight: !ref <lm_weight>
+    ctc_weight: !ref <ctc_weight_decode>
+    temperature: !ref <temperature>
+    temperature_lm: !ref <temperature_lm>
+
+# This function manages learning rate annealing over the epochs.
+# We here use the NewBob algorithm, which anneals the learning rate if
+# the improvement over two consecutive epochs is less than the defined
+# threshold.
+lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
+    initial_value: !ref <lr>
+    improvement_threshold: 0.0025
+    annealing_factor: 0.8
+    patient: 0
+
+# This optimizer will be constructed by the Brain class after all parameters
+# are moved to the correct device. Then it will be added to the checkpointer.
+opt_class: !name:torch.optim.Adadelta
+    lr: !ref <lr>
+    rho: 0.95
+    eps: 1.e-8
+
+# Functions that compute the statistics to track during the validation step.
+error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
+
+cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
+ split_tokens: True
+
+# This object is used for saving the state of training both so that it
+# can be resumed if it gets interrupted, and also so that the best checkpoint
+# can be later loaded for evaluation or inference.
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+    checkpoints_dir: !ref <save_folder>
+    recoverables:
+        model: !ref <model>
+        scheduler: !ref <lr_annealing>
+        normalizer: !ref <normalize>
+        counter: !ref <epoch_counter>
+
+# This object is used to pretrain the language model and the tokenizers
+# (defined above). In this case, we also pretrain the ASR model (to make
+# sure the model converges on a small amount of data)
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    collect_in: !ref <save_folder>
+    loadables:
+        lm: !ref <lm_model>
+        tokenizer: !ref <tokenizer>
+        model: !ref <model>
+    paths:
+        lm: !ref <pretrained_path>/lm.ckpt
+        tokenizer: !ref <pretrained_path>/tokenizer.ckpt
+        model: !ref <pretrained_path>/asr.ckpt