New extended prompt format for Canary, short utterances inference fix, and training micro-optimizations #11058

Merged: 19 commits, merged Dec 5, 2024. Changes shown from 16 commits.
3 changes: 2 additions & 1 deletion examples/asr/conf/speech_multitask/fast-conformer_aed.yaml
@@ -24,11 +24,12 @@ spl_tokens:
model:
  sample_rate: 16000
  label_smoothing: 0.0
- context_len_for_AR_decoding: 5 # Length of input prompt tokens. For example, in Canary models, we use [BOS,src_lang,task,tgt_lang,pnc] and thus the length is 5
+ use_loss_mask_for_prompt: false
  log_prediction: true # enables logging sample predictions in the output during training

  # Important ! Set the prompt format to the class you need
  prompt_format: ??? # Options supported: ["canary"]
+ prompt_defaults: null

  model_defaults:
    asr_enc_hidden: 1024
24 changes: 23 additions & 1 deletion nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
@@ -16,6 +16,7 @@

import torch.utils.data
from lhotse import CutSet
+ from lhotse.cut import MixedCut
from lhotse.dataset import AudioSamples
from lhotse.dataset.collation import collate_vectors

@@ -99,7 +100,7 @@ def __getitem__(self, cuts: CutSet) -> PromptedAudioToTextMiniBatch:
            prompt_lens=prompt_lens,
            prompted_transcript=prompts_with_answers,
            prompted_transcript_lens=prompts_with_answers_lens,
-           cuts=cuts.drop_in_memory_data(),
+           cuts=_drop_in_memory_data(cuts),
        )

    def _collate_tokens(self, tokens: list[Union[list[int], torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]:
@@ -111,3 +112,24 @@

class ProbablyIncorrectLanguageKeyError(RuntimeError):
    pass


+ def _drop_in_memory_data(
+     cuts: CutSet,
+     _fields=frozenset(MixedCut.__dataclass_fields__.keys()),
+ ) -> CutSet:
+     """Workaround for an edge case in cuts.drop_in_memory_data() on MixedCut with Lhotse<1.29.0"""
+     ans = []
+     for c in cuts:
+         # Not a mixed cut or a mixed cut that wasn't assigned any extra attributes.
+         if not isinstance(c, MixedCut) or _fields.issuperset(c.__dict__.keys()):
+             ans.append(c.drop_in_memory_data())
+         else:
+             extra_attrs = {k: v for k, v in c.__dict__.items() if k not in _fields}
+             for k in extra_attrs:
+                 delattr(c, k)
+             ans.append(c.drop_in_memory_data())
+             for k, v in extra_attrs.items():
+                 setattr(ans[-1], k, v)
+                 setattr(c, k, v)
+     return CutSet(ans)
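The detach-and-restore pattern above is easier to see with a toy stand-in for MixedCut. A minimal sketch (ToyCut and its rebuild() method are illustrative stand-ins, not Lhotse API):

```python
from dataclasses import dataclass, fields

@dataclass
class ToyCut:
    id: str

    def rebuild(self) -> "ToyCut":
        # Stands in for MixedCut.drop_in_memory_data() on Lhotse < 1.29.0:
        # returns a fresh dataclass instance, losing dynamically set attributes.
        return ToyCut(id=self.id)

cut = ToyCut(id="utt-1")
cut.context = "extra metadata"  # attribute attached outside the dataclass fields
declared = {f.name for f in fields(ToyCut)}
extra = {k: v for k, v in cut.__dict__.items() if k not in declared}
for k in extra:
    delattr(cut, k)  # detach so the rebuild sees a clean instance
new_cut = cut.rebuild()
for k, v in extra.items():
    setattr(new_cut, k, v)  # restore on the new instance...
    setattr(cut, k, v)      # ...and back on the original
assert new_cut.context == "extra metadata"
```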
46 changes: 32 additions & 14 deletions nemo/collections/asr/models/aed_multitask_models.py
@@ -70,7 +70,8 @@ def lens_to_mask(lens, max_length):
    Create a mask from a tensor of lengths.
    """
    batch_size = lens.shape[0]
-   mask = torch.arange(max_length).repeat(batch_size, 1).to(lens.device) < lens[:, None]
+   arange = torch.arange(max_length, device=lens.device)
+   mask = arange.expand(batch_size, max_length) < lens.unsqueeze(1)
Comment (Collaborator Author): micro-optimization, removes some copies and memory movement

    return mask
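For illustration, a standalone sketch of the before/after (toy shapes, not part of the diff): the old version allocates the arange on CPU, materializes a full (batch, length) copy via repeat(), and then transfers it to the target device, while the new version allocates directly on the device and expand() returns a copy-free broadcasted view.

```python
import torch

lens = torch.tensor([3, 5])
max_length = 6
batch_size = lens.shape[0]

# Before: CPU allocation + (batch, length) copy via repeat() + device transfer.
mask_old = torch.arange(max_length).repeat(batch_size, 1).to(lens.device) < lens[:, None]

# After: one allocation on the right device; expand() is a view, not a copy.
arange = torch.arange(max_length, device=lens.device)
mask_new = arange.expand(batch_size, max_length) < lens.unsqueeze(1)

assert torch.equal(mask_old, mask_new)
```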


@@ -697,24 +698,33 @@ def training_step(self, batch: PromptedAudioToTextMiniBatch, batch_nb):
            return torch.tensor([0.0])

        input_ids, labels = batch.get_decoder_inputs_outputs()
+       input_ids_lens = batch.prompted_transcript_lens - 1
Comment (Collaborator Author): fixing off-by-one issue that included an extra padding frame in decoder masks
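A toy illustration of the off-by-one (token ids invented; get_decoder_inputs_outputs is assumed to do the usual shift): a prompted transcript of length M yields decoder inputs and labels of length M - 1, so passing the unshifted lengths masked one extra padding position.

```python
import torch

prompted_transcript = torch.tensor([[101, 7, 8, 42, 43, 102]])  # [BOS, prompt..., text..., EOS]
prompted_transcript_lens = torch.tensor([6])

input_ids = prompted_transcript[:, :-1]        # decoder input: drop the final token
labels = prompted_transcript[:, 1:]            # loss target: drop BOS
input_ids_lens = prompted_transcript_lens - 1  # 5, matching input_ids; 6 would cover one pad frame
```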


+       num_frames = batch.audio_lens.sum().float()
+       num_tokens = batch.prompted_transcript_lens.sum().float()
+       tot_frames = torch.as_tensor(batch.audio.numel(), device=num_frames.device, dtype=torch.float)
+       tot_tokens = torch.as_tensor(batch.prompted_transcript.numel(), device=num_frames.device, dtype=torch.float)
Comment (Collaborator Author): micro-optimizations
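Sketched in isolation (toy shapes): .numel() returns a plain Python int, and torch.as_tensor wraps both totals so the ratios logged below are computed as tensors rather than Python floats, which, per the author's comment, sidesteps slow scalar-to-tensor conversion later in the logger.

```python
import torch

audio = torch.randn(4, 16000)  # padded batch of 4 utterances
audio_lens = torch.tensor([16000, 12000, 8000, 16000])

num_frames = audio_lens.sum().float()  # tensor on the batch's device
tot_frames = torch.as_tensor(audio.numel(), device=num_frames.device, dtype=torch.float)
input_to_padding_ratio = num_frames / tot_frames  # tensor math, no .item() round-trip
```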


        transf_log_probs, encoded_len, enc_states, enc_mask = self.forward(
            input_signal=batch.audio,
            input_signal_length=batch.audio_lens,
            transcript=input_ids,
-           transcript_length=batch.prompted_transcript_lens,
+           transcript_length=input_ids_lens,
        )

-       audio_loss = self.loss(log_probs=transf_log_probs, labels=labels)
+       # Mask components: 1) discard padding & 2) discard prompt (notice the negation)
+       # For a full decoder sequence O with len M, the loss mask skips the first element,
+       # covering the remaining M-1 elements - hence we subtract 1 from prompt lens to account for BOS.
+       loss_mask = None
+       if self.cfg.get("use_loss_mask_for_prompt", False):
+           maxlen = batch.prompted_transcript.shape[1] - 1
+           loss_mask = lens_to_mask(input_ids_lens, maxlen) & ~lens_to_mask(batch.prompt_lens - 1, maxlen)
+       audio_loss = self.loss(log_probs=transf_log_probs, labels=labels, output_mask=loss_mask)

-       num_frames = batch.audio_lens.sum()
-       num_tokens = batch.prompted_transcript_lens.sum()
-       tot_frames = batch.audio.numel()
-       tot_tokens = batch.prompted_transcript.numel()
        tensorboard_logs = {
            'train_loss': audio_loss,
-           'learning_rate': self._optimizer.param_groups[0]['lr'],
-           'batch_size': batch.audio.shape[0],
+           'learning_rate': torch.as_tensor(self._optimizer.param_groups[0]['lr']),
+           'batch_size': torch.as_tensor(batch.audio.shape[0]),
Comment (Collaborator Author): micro-optimizations (the PTL logger turned out to have an inefficient way of converting scalars to tensors)

            'num_frames': num_frames,
            'num_tokens': num_tokens,
            'input_to_padding_ratio': num_frames / tot_frames,
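As an aside, here is how the two components of the new loss mask compose on a toy example (values invented; lens_to_mask is the helper defined at the top of this file): the first term drops padding, the negated second term drops the prompt, and only the answer tokens contribute to the loss.

```python
import torch

def lens_to_mask(lens, max_length):
    arange = torch.arange(max_length, device=lens.device)
    return arange.expand(lens.shape[0], max_length) < lens.unsqueeze(1)

prompted_transcript_lens = torch.tensor([7])  # [BOS, p1, p2, t1, t2, t3, EOS]
prompt_lens = torch.tensor([3])               # [BOS, p1, p2]
maxlen = 7 - 1                                # labels have length M - 1 after the shift

not_padding = lens_to_mask(prompted_transcript_lens - 1, maxlen)
not_prompt = ~lens_to_mask(prompt_lens - 1, maxlen)  # -1: BOS is gone after the shift
loss_mask = not_padding & not_prompt
print(loss_mask)  # tensor([[False, False,  True,  True,  True,  True]])
# i.e. the loss covers [t1, t2, t3, EOS] only.
```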
@@ -725,6 +735,7 @@

    def validation_pass(self, batch: PromptedAudioToTextMiniBatch, batch_idx, dataloader_idx=0, eval_mode="val"):
        input_ids, labels = batch.get_decoder_inputs_outputs()
+       input_ids_lens = batch.prompted_transcript_lens - 1

        transf_log_probs, encoded_len, enc_states, enc_mask = self.forward(
            input_signal=batch.audio,
@@ -733,11 +744,16 @@
            transcript_length=batch.prompted_transcript_lens,
        )

-       transf_loss = self.loss(log_probs=transf_log_probs, labels=labels)
-       self.val_loss(loss=transf_loss, num_measurements=transf_log_probs.shape[0] * transf_log_probs.shape[1])
-       output_dict = {
-           f'{eval_mode}_loss': transf_loss,
-       }
+       # Mask components: 1) discard padding & 2) discard prompt (notice the negation)
+       # For a full decoder sequence O with len M, the loss mask skips the first element,
+       # covering the remaining M-1 elements - hence we subtract 1 from prompt lens to account for BOS.
+       loss_mask = None
+       if self.cfg.get("use_loss_mask_for_prompt", False):
+           maxlen = batch.prompted_transcript.shape[1] - 1
+           loss_mask = lens_to_mask(input_ids_lens, maxlen) & ~lens_to_mask(batch.prompt_lens - 1, maxlen)
+       transf_loss = self.loss(log_probs=transf_log_probs, labels=labels, output_mask=loss_mask)
+       # Guard the measurement count so this also works when the prompt mask is disabled.
+       num_measurements = (
+           loss_mask.long().sum() if loss_mask is not None else transf_log_probs.shape[0] * transf_log_probs.shape[1]
+       )
+       self.val_loss(loss=transf_loss, num_measurements=num_measurements)
+       output_dict = {f'{eval_mode}_loss': transf_loss}

        self.wer.update(
            predictions=enc_states,
@@ -983,6 +999,8 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader':
            'text_field': config.get('text_field', 'answer'),
            'lang_field': config.get('lang_field', 'target_lang'),
            'channel_selector': config.get('channel_selector', None),
+           'pad_min_duration': config.get('pad_min_duration', 1.0),
+           'pad_direction': config.get('pad_direction', 'both'),
        }

        temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config))
20 changes: 0 additions & 20 deletions nemo/collections/asr/modules/transformer/transformer_modules.py
@@ -58,27 +58,7 @@ def _build_pos_enc(self, hidden_size, max_sequence_length, device=None):
        self.register_buffer('pos_enc', pos_enc)

    def forward(self, position_ids):
-       max_pos_id = position_ids.max()
-       # update positional encoding if needed
-       if max_pos_id >= self._max_sequence_length:
Comment (@pzelasko, Collaborator Author, Dec 4, 2024): This check is super costly as it triggers a DtoH transfer and CUDA sync on every call to transformer decoder forward, and the proposed solution doesn't work anyway (bad results instead of a crash).

-           logging.warning(
-               f'Max position id {max_pos_id} is greater than max sequence length {self._max_sequence_length}. Expanding position embeddings just for this batch. This is not expected to work very well. Consider chunking your input into smaller sequences.'
-           )
-           self._build_pos_enc(
-               hidden_size=self._hidden_size,
-               max_sequence_length=max_pos_id + 1,
-               device=position_ids.device,
-           )

        embeddings = torch.embedding(self.pos_enc, position_ids)

-       # Revert expansion of position embeddings since this will cause checkpoint size mismatches.
-       if max_pos_id >= self._max_sequence_length:
-           self._build_pos_enc(
-               hidden_size=self._hidden_size,
-               max_sequence_length=self._max_sequence_length,
-               device=position_ids.device,
-           )
        return embeddings
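A minimal repro of the cost being deleted (sketch; the 512 threshold is arbitrary): using a CUDA tensor in a Python comparison forces an implicit device-to-host copy and a stream synchronization on every forward call.

```python
import torch

if torch.cuda.is_available():
    position_ids = torch.arange(128, device="cuda").unsqueeze(0)
    max_pos_id = position_ids.max()  # GPU reduction, still asynchronous
    if max_pos_id >= 512:            # bool(tensor) here: implicit DtoH copy + CUDA sync
        print("would have expanded the position embeddings")
```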


6 changes: 6 additions & 0 deletions nemo/collections/common/data/lhotse/dataloader.py
@@ -135,6 +135,9 @@ class LhotseDataLoadingConfig:
    rir_enabled: bool = False
    rir_path: str | None = None  # str, must point to a lhotse RecordingSet manifest
    rir_prob: float = 0.5
+   # f. Padding to a minimum duration. Examples shorter than this will be padded, others are unaffected.
+   pad_min_duration: Optional[float] = None
+   pad_direction: str = "right"  # "right" | "left" | "both" | "random"

    # 5. Other Lhotse options.
    text_field: str = "text"  # key to read the transcript from
@@ -278,6 +281,9 @@ def get_lhotse_dataloader_from_config(
        keep_excessive_supervisions=config.keep_excessive_supervisions,
    )

+   if config.pad_min_duration is not None:
+       cuts = cuts.pad(duration=config.pad_min_duration, direction=config.pad_direction, preserve_id=True)

    # Duration filtering, same as native NeMo dataloaders.
    # We can filter after the augmentations because they are applied only when calling load_audio().
    cuts = cuts.filter(DurationFilter(config.min_duration, config.max_duration))
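For reference, a hedged usage sketch of the new option applied directly with Lhotse (the manifest path is hypothetical): this is the same CutSet.pad call the branch above makes, and preserve_id=True keeps cut ids stable for downstream bookkeeping.

```python
from lhotse import CutSet

cuts = CutSet.from_file("cuts.jsonl.gz")  # hypothetical manifest path
# Pad every cut shorter than 1 s up to 1 s, adding silence on both sides.
cuts = cuts.pad(duration=1.0, direction="both", preserve_id=True)
```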
1 change: 1 addition & 0 deletions nemo/collections/common/prompts/__init__.py
@@ -1,4 +1,5 @@
from nemo.collections.common.prompts.canary import CanaryPromptFormatter
+ from nemo.collections.common.prompts.canary2 import Canary2PromptFormatter
from nemo.collections.common.prompts.fn import get_prompt_format_fn, registered_prompt_format_fn
from nemo.collections.common.prompts.formatter import PromptFormatter
from nemo.collections.common.prompts.gemma import GemmaPromptFormatter
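A sketch of how the new formatter becomes reachable once the package is imported (resolving by name is an assumption about the PromptFormatter registry, and the "canary2" name is assumed rather than shown in this diff):

```python
from nemo.collections.common.prompts import Canary2PromptFormatter, PromptFormatter

# Assumed registry behavior: importing the module registers the subclass,
# and resolve() maps a prompt_format name from the config to the class.
formatter_cls = PromptFormatter.resolve("canary2")
assert formatter_cls is Canary2PromptFormatter
```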