From 1489ac7c239eff1cf4e3ee43ac606b7608cdee5a Mon Sep 17 00:00:00 2001 From: George <37293288+Jorjeous@users.noreply.github.com> Date: Wed, 14 Feb 2024 20:40:32 +0400 Subject: [PATCH 01/28] coldfix (#8412) Signed-off-by: George Zelenfroynd Signed-off-by: Michal Futrega --- tools/speech_data_explorer/data_explorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/speech_data_explorer/data_explorer.py b/tools/speech_data_explorer/data_explorer.py index 65eafc5c9d49..e628a43f8ebc 100755 --- a/tools/speech_data_explorer/data_explorer.py +++ b/tools/speech_data_explorer/data_explorer.py @@ -126,7 +126,7 @@ def parse_args(): # automaticly going in comparison mode, if there is names_compared argument if args.names_compared is not None: comparison_mode = True - logging.error("comparison mod set to true") + logging.info("comparison mod set to true") else: comparison_mode = False From 5c70924e1d5812a458919aa4ff427f8000827c8c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 14 Feb 2024 16:15:42 -0700 Subject: [PATCH 02/28] Fixed errors in the CTM gen functions (#8416) (#8420) Signed-off-by: Taejin Park Co-authored-by: Taejin Park Signed-off-by: Michal Futrega --- scripts/speaker_tasks/create_alignment_manifest.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/speaker_tasks/create_alignment_manifest.py b/scripts/speaker_tasks/create_alignment_manifest.py index 91825ac0d7e9..63907f844e6d 100644 --- a/scripts/speaker_tasks/create_alignment_manifest.py +++ b/scripts/speaker_tasks/create_alignment_manifest.py @@ -32,7 +32,10 @@ def get_seg_info_from_ctm_line( """ Get time stamp information and speaker labels from CTM lines. This is following CTM format appeared in `Rich Transcription Meeting Eval Plan: RT09` document. - + + CTM Format: + < + Args: ctm_list (list): List containing CTM items. e.g.: ['sw02001-A', '1', '0.000', '0.200', 'hello', '0.98', 'lex', 'speaker3'] output_precision (int): Precision for CTM outputs in integer. 
@@ -47,6 +50,8 @@ def get_seg_info_from_ctm_line( end = float(ctm_list[start_time_index]) + float(ctm_list[duration_index]) start = round(start, output_precision) end = round(end, output_precision) + if type(speaker_id) == str: + speaker_id = speaker_id.strip() return start, end, speaker_id @@ -106,7 +111,7 @@ def create_new_ctm_entry(session_name, speaker_id, wordlist, alignments, output_ start_time=align1, duration=align2, token=word, - conf=0, + conf=None, type_of_token='lex', speaker=speaker_id, output_precision=output_precision, @@ -245,7 +250,7 @@ def create_manifest_with_alignments( prev_end = 0 for i in range(len(lines)): ctm = lines[i].split(' ') - speaker_id, start, end = get_seg_info_from_ctm_line(ctm_list=ctm, output_precision=output_precision) + start, end, speaker_id = get_seg_info_from_ctm_line(ctm_list=ctm, output_precision=output_precision) interval = start - prev_end if (i == 0 and interval > 0) or (i > 0 and interval > silence_dur_threshold): From 859b8f2ef6d64019898e762b1a984f90614d4baf Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 14 Feb 2024 16:18:45 -0700 Subject: [PATCH 03/28] Add change_vocabulary and save_tokenizers() support to Multitask ASR models (#8357) (#8367) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add change_vocabulary and save_tokenizers() support * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update nemo/collections/asr/models/aed_multitask_models.py --------- Signed-off-by: smajumdar Signed-off-by: Somshubra Majumdar Co-authored-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko Signed-off-by: Michal Futrega --- .../asr/models/aed_multitask_models.py | 135 +++++++++++++++++- nemo/collections/asr/parts/mixins/mixins.py | 95 +++++++++++- 2 files changed, 228 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index 0197c49a1cb7..4fb88b208076 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import itertools import os import tempfile @@ -23,7 +24,7 @@ import numpy as np import torch import torch.distributed as dist -from omegaconf import DictConfig, OmegaConf, open_dict +from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict from pytorch_lightning import Trainer from torchmetrics.text import SacreBLEUScore from tqdm.auto import tqdm @@ -247,6 +248,138 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig): logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") + def change_vocabulary( + self, + new_tokenizer_dir: Union[str, DictConfig], + new_tokenizer_type: str, + decoding_cfg: Optional[DictConfig] = None, + prompt_format: Optional[str] = None, + ): + """ + Changes vocabulary used during AED decoding process. Use this method when fine-tuning on from pre-trained model. + This method changes only decoder and leaves encoder and pre-processing modules unchanged. 
For example, you would + use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need + model to learn capitalization, punctuation and/or special characters. + + Args: + new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`) + new_tokenizer_type: Type of tokenizer. Can be either `agg`, `bpe` or `wpe`. + decoding_cfg: A config for the decoding, which is optional. If the decoding type + needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. + prompt_format: A string alias of the object that represents the prompt structure. + If not None, it will be used to update the prompt format. + """ + if isinstance(new_tokenizer_dir, (dict, DictConfig)): + if new_tokenizer_type == 'agg': + if not isinstance(new_tokenizer_dir, DictConfig): + new_tokenizer_dir = OmegaConf.create(new_tokenizer_dir) + + new_tokenizer_cfg = new_tokenizer_dir + else: + raise ValueError( + f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}' + ) + else: + new_tokenizer_cfg = None + + if new_tokenizer_cfg is not None: + tokenizer_cfg = new_tokenizer_cfg + else: + if not os.path.isdir(new_tokenizer_dir): + raise NotADirectoryError( + f'New tokenizer dir must be non-empty path to a directory. But instead got: {new_tokenizer_dir}' + ) + + if new_tokenizer_type.lower() not in ('bpe', 'wpe'): + raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`') + + tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type}) + + if prompt_format is None: + prompt_format = self.cfg.prompt_format + + # Setup the tokenizer + self._setup_tokenizer(tokenizer_cfg) + + # Initialize a dummy vocabulary + vocabulary = self.tokenizer.tokenizer.get_vocab() + + # Setup Decoder + transf_decoder_cfg_dict = self.transf_decoder.to_config_dict() + + vocab_size = 8 * ceil(self.tokenizer.vocab_size / 8) + + # Auto inject vocab size for `get_transformer` + with open_dict(transf_decoder_cfg_dict): + if 'config_dict' in transf_decoder_cfg_dict: + transf_decoder_cfg_dict['config_dict']['vocab_size'] = vocab_size + + original_decoder_state_dict = self.transf_decoder.state_dict() + self.transf_decoder = EncDecMultiTaskModel.from_config_dict(transf_decoder_cfg_dict) + + # Partially load the original state dict into the new decoder + decoder_state_dict = self.transf_decoder.state_dict() + for og_key, og_value in original_decoder_state_dict.items(): + if og_key in decoder_state_dict and og_value.shape == decoder_state_dict[og_key].shape: + decoder_state_dict[og_key] = og_value + else: + logging.warning( + f"Skipping key `{og_key}` in the `transf_decoder` module from original state dict due " + f"to shape mismatch after change in vocabulary.\n" + f"Original shape: {og_value.shape}, New shape: {decoder_state_dict[og_key].shape}" + ) + + self.transf_decoder.load_state_dict(decoder_state_dict) + + # Setup token classifier + with open_dict(self.cfg.head): + self.cfg.head.num_classes = vocab_size + + del self.log_softmax + self.log_softmax = EncDecMultiTaskModel.from_config_dict(self.cfg.head) + + # Weight tying - if using TokenClassifier only + if isinstance(self.log_softmax, TokenClassifier): + self.log_softmax.mlp.layer0.weight = self.transf_decoder.embedding.token_embedding.weight + + # Initialize weights of token classifier + std_init_range = 1 / self.cfg.model_defaults.lm_dec_hidden ** 0.5 + self.log_softmax.apply(lambda 
module: transformer_weights_init(module, std_init_range)) + + # Setup Decoding class + if decoding_cfg is None: + # Assume same decoding config as before + decoding_cfg = self.cfg.decoding + + # Assert the decoding config with all hyper parameters + decoding_cls = OmegaConf.structured(MultiTaskDecodingConfig) + decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) + decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) + + del self.decoding + self.decoding = MultiTaskDecoding( + decoding_cfg=decoding_cfg, + transformer_decoder=self.transf_decoder, + log_softmax_module=self.log_softmax, + tokenizer=self.tokenizer, + ) + + with open_dict(self.cfg.decoding): + self.cfg.decoding = decoding_cfg + + # Setup loss + with open_dict(self.cfg.loss): + self.cfg.loss.pad_id = self.tokenizer.pad_id + + del self.loss + self.loss = EncDecMultiTaskModel.from_config_dict(self.cfg.loss) + + # Update config + with open_dict(self.cfg): + self.cfg.prompt_format = prompt_format + + logging.info(f"Changed decoder to output to {vocabulary} vocabulary.") + @torch.no_grad() def transcribe( self, diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index eeac9d3c78ad..006f028a0a1d 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -13,6 +13,8 @@ # limitations under the License. import os +import shutil +import tarfile from abc import ABC, abstractmethod from typing import List @@ -25,7 +27,7 @@ from nemo.collections.asr.parts.utils import asr_module_utils from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common import tokenizers -from nemo.utils import logging +from nemo.utils import app_state, logging class ASRBPEMixin(ABC): @@ -372,6 +374,97 @@ def _cleanup_aggregate_config_and_artifacts_if_needed(self): if akey.startswith('tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.'): self.artifacts.pop(akey) + def save_tokenizers(self, directory: str): + """ + Save the model tokenizer(s) to the specified directory. + + Args: + directory: The directory to save the tokenizer(s) to. + """ + if not hasattr(self, 'cfg'): + raise RuntimeError( + "The model has not been initialized with a tokenizer yet. Please call the model's " + "__init__ and _setup_tokenizer methods first." + ) + + if self.tokenizer_type == 'agg': + for lang in self.tokenizer.langs: + subconfig = self.cfg.tokenizer.langs.get(lang) + new_dir = os.path.join(directory, lang) + self._extract_tokenizer_from_config(subconfig, new_dir) + else: + self._extract_tokenizer_from_config(self.cfg.tokenizer, directory) + + def _extract_tokenizer_from_config(self, tokenizer_cfg: DictConfig, dir: str): + """ + Extracts the tokenizer from the config and write the objects to dir. + The file may be from a local path (new model init) or from a .nemo file (restored model). + If its from a newly initialized model, the file is copied to dir. + If its from a restored model, the file is extracted from the .nemo file and copied to dir. + + Args: + tokenizer_cfg: The tokenizer config to extract the tokenizer from. + dir: The directory to write the tokenizer objects to. 
+ """ + if not os.path.exists(dir): + os.makedirs(dir, exist_ok=True) + + nemo_file_objects = [] + + for k, v in tokenizer_cfg.items(): + # Check if the value is a filepath (new model init) or has `nemo:` in it (restored model) + if isinstance(v, str) and os.path.exists(v): + # local file from first instantiation + loc = shutil.copy2(v, dir) + logging.info(f"Saved {k} at {loc}") + + if isinstance(v, str) and v.startswith('nemo:'): + nemo_object_name = v[5:] + nemo_file_objects.append(nemo_object_name) + + if len(nemo_file_objects) > 0: + logging.debug(f"Copying the following nemo file objects to {dir}: {nemo_file_objects}") + + if not hasattr(self, 'model_guid'): + raise ValueError( + "The model does not have a model_guid attribute. " + "Please ensure that the model has been restored from a .nemo file." + ) + + appstate = app_state.AppState() + restore_path = appstate.get_model_metadata_from_guid(self.model_guid).restoration_path + if restore_path is None: + raise ValueError( + "The model has not been restored from a .nemo file. Cannot extract the tokenizer " + "as the nemo file cannot be located." + ) + + # Read the nemo file without fully extracting all contents + # we start with an assumption of uncompressed tar, + # which should be true for versions 1.7.0 and above + tar_header = "r:" + try: + tar_test = tarfile.open(restore_path, tar_header) + tar_test.close() + except tarfile.ReadError: + # can be older checkpoint => try compressed tar + tar_header = "r:gz" + tar = tarfile.open(restore_path, tar_header) + + for nemo_object_name in nemo_file_objects: + members = [x for x in tar.getmembers() if nemo_object_name in x.name] + for member in members: + tar.extract(member, dir) + + new_name = member.name.split("_")[1:] + if len(new_name) > 1: + new_name = "_".join(new_name) + else: + new_name = new_name[0] + os.rename(os.path.join(dir, member.name), os.path.join(dir, new_name)) + + logging.info(f"Saved {nemo_object_name} at {os.path.join(dir, new_name)}") + class ASRModuleMixin(ASRAdapterModelMixin): """ From b951d6465c724e466aed10c1841dc2c24ca7832f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 14 Feb 2024 17:05:49 -0700 Subject: [PATCH 04/28] fix path location and branch (#8314) * fix path location and branch (#8304) * fix path location and branch Signed-off-by: Nithin Rao Koluguri * change to a floating point number Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri Co-authored-by: Somshubra Majumdar * updat ebranch in tutorial Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Co-authored-by: Somshubra Majumdar Co-authored-by: Nithin Rao Koluguri Signed-off-by: Michal Futrega --- tutorials/asr/ASR_with_Transducers.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tutorials/asr/ASR_with_Transducers.ipynb b/tutorials/asr/ASR_with_Transducers.ipynb index 052c6a6b65eb..d20042b9b970 100644 --- a/tutorials/asr/ASR_with_Transducers.ipynb +++ b/tutorials/asr/ASR_with_Transducers.ipynb @@ -31,7 +31,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'r1.21.0'\n", + "BRANCH = 'main'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", @@ -191,7 +191,7 @@ " file_id[file_id.find('-')+1 : file_id.rfind('-')],\n", " file_id + '.wav')\n", "\n", - " 
duration = librosa.core.get_duration(filename=audio_path)\n", + " duration = librosa.core.get_duration(path=audio_path)\n", "\n", " # Write the metadata to the manifest\n", " metadata = {\n", @@ -338,7 +338,7 @@ "source": [ "from omegaconf import OmegaConf, open_dict\n", "\n", - "config = OmegaConf.load(\"/content/configs/contextnet_rnnt.yaml\")" + "config = OmegaConf.load(\"configs/contextnet_rnnt.yaml\")" ] }, { From d43d29f97d1a7d0c75edec8d041e112db0410be4 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 15 Feb 2024 14:36:23 -0800 Subject: [PATCH 05/28] Add TP comm overlap knobs to AutocastTransformerLayer (#8290) Signed-off-by: Jaemin Choi Co-authored-by: Jaemin Choi Signed-off-by: Michal Futrega --- .../nlp/modules/common/megatron/transformer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 9e9c7b526782..9bbe863d34ff 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -796,6 +796,12 @@ def __init__( drop_path_rate: float = 0, use_emha: bool = False, ub_tp_comm_overlap: bool = False, + ub_bulk_wgrad: bool = True, + ub_bulk_dgrad: bool = True, + ub_split_ag: bool = True, + ub_split_rs: bool = True, + ub_atomic_gemm_ag: bool = False, + ub_atomic_gemm_rs: bool = False, autocast_dtype: Any = 16, zero_centered_gamma: bool = False, device: str = 'cuda', @@ -828,6 +834,12 @@ def __init__( fuse_qkv_params=True, zero_centered_gamma=zero_centered_gamma, ub_tp_comm_overlap=ub_tp_comm_overlap, + ub_bulk_wgrad=ub_bulk_wgrad, + ub_bulk_dgrad=ub_bulk_dgrad, + ub_split_ag=ub_split_ag, + ub_split_rs=ub_split_rs, + ub_atomic_gemm_ag=ub_atomic_gemm_ag, + ub_atomic_gemm_rs=ub_atomic_gemm_rs, device=device, ) # use_emha=use_emha, @@ -1076,6 +1088,12 @@ def build_layer(layer_number): autocast_dtype=precision, use_emha=use_emha, ub_tp_comm_overlap=ub_tp_comm_overlap, + ub_bulk_wgrad=config.tp_comm_bulk_wgrad, + ub_bulk_dgrad=config.tp_comm_bulk_dgrad, + ub_split_ag=config.tp_comm_split_ag, + ub_split_rs=config.tp_comm_split_rs, + ub_atomic_gemm_ag=config.tp_comm_atomic_ag, + ub_atomic_gemm_rs=config.tp_comm_atomic_rs, zero_centered_gamma=normalization == 'layernorm1p', device='cpu' if config.use_cpu_initialization else 'cuda', ) From 7ae94a0198fffea4c3c58e78f6013bea4a799f77 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:26:18 -0700 Subject: [PATCH 06/28] add deallocate pipeline output optimization (#8279) (#8318) * add deallocate pipeline output optimization * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jimmy Zhang Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Michal Futrega --- .../nlp/models/language_modeling/megatron_base_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 56f5d146e964..269279d8e856 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -475,6 +475,7 @@ def 
build_transformer_config(self) -> TransformerConfig: 'recompute_num_layers': recompute_num_layers, 'distribute_saved_activations': False, # not currently used in NeMo 'fp8': None, + 'deallocate_pipeline_outputs': True, } # populate the transformer config dict From 344f655e82db91d4bf506dd1625c4587b0031c99 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:26:48 -0700 Subject: [PATCH 07/28] remove assertion (#8302) (#8321) Signed-off-by: dimapihtar Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Signed-off-by: Michal Futrega --- .../nlp/models/language_modeling/megatron_gpt_model.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 3bdc1182dda3..0a9c65be42ab 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1303,11 +1303,6 @@ def setup(self, stage=None): self.init_global_step = self.trainer.global_step if self.rampup_batch_size: - optimizer = self.cfg.optim.get('name', None) - assert ( - optimizer == 'fused_adam' - ), f'{optimizer} optimizer is not supported yet with rampup batch size. Please, use fused_adam optimizer instead.' - num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR num_microbatch_calculator.update(self.init_consumed_samples, consistency_check=False) self.prev_consumed_samples = self.init_consumed_samples From 9a0c79dcd176f95fd8dfc4e562b77e56d1d92781 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:28:32 -0700 Subject: [PATCH 08/28] Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 (#8334) (#8346) Signed-off-by: Sangkug Lym Co-authored-by: Sangkug Lym Co-authored-by: Eric Harper Signed-off-by: Michal Futrega --- .../nlp/models/language_modeling/megatron_gpt_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0a9c65be42ab..9c3657d4c4ef 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -937,8 +937,8 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ # Transfer needed data to GPU required_keys = set() - max_seqlen = batch.pop('max_seqlen').squeeze() if 'max_seqlen' in batch else None - cu_seqlens_argmin = batch.pop('cu_seqlens_argmin') if 'cu_seqlens_argmin' in batch else None + max_seqlen = batch['max_seqlen'].squeeze() if 'max_seqlen' in batch else None + cu_seqlens_argmin = batch['cu_seqlens_argmin'] if 'cu_seqlens_argmin' in batch else None if parallel_state.get_pipeline_model_parallel_world_size() == 1: required_keys.update(batch.keys()) else: From a28e1532873855a164736f744c725a362dfff01c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:28:49 -0700 Subject: [PATCH 09/28] Enable megatron core loggers for GPT pretraining (#8354) (#8384) * Logging changes tested for gpt_pretraining * Additional args * [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci --------- Signed-off-by: Aishwarya Bhandare Co-authored-by: ashbhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper Signed-off-by: Michal Futrega --- .../conf/megatron_gpt_config.yaml | 7 ++++ .../conf/megatron_model_base_config.yaml | 3 +- .../language_modeling/megatron_base_model.py | 40 ++++++++++++++++++- .../language_modeling/megatron_gpt_model.py | 6 +++ 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 63d2297838c3..004e8b584a13 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -211,6 +211,13 @@ model: ## Network sharp: False # Enable the use of SHARP for NCCL data-parallel communications. This is going to be ignored if the network doesn't support SHARP. + ## Megatron timers + enable_megatron_timers: False + megatron_timer_kwargs: + log_every_n_steps: 10 + log_mode: minmax + barrier: False + data: # Path to data must be specified by the user. # Supports List, String and Dictionary diff --git a/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml b/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml index 4da8177685a1..235bf3d3f227 100644 --- a/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml @@ -37,4 +37,5 @@ normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sq num_moe_experts: 1 # When >1, FFNs are changed to MoE layers moe_frequency: 1 # every Nth ffn layer will be made MoE moe_dropout: 0.0 # Dropout value for MoE layers -use_flash_attention: false # Use flash attention in self-attention module \ No newline at end of file +use_flash_attention: false # Use flash attention in self-attention module +enable_megatron_timers: false # Megatron timers \ No newline at end of file diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 269279d8e856..5321a307b2c4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -69,6 +69,13 @@ HAVE_MEGATRON_CORE = False +try: + from megatron.core import Timers + + HAVE_MEGATRON_CORE_TIMERS = True +except (ImportError, ModuleNotFoundError): + HAVE_MEGATRON_CORE_TIMERS = False + __all__ = ["MegatronBaseModel"] @@ -124,6 +131,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): else torch.float32 ) + self.megatron_timers = None + if self.cfg.get('enable_megatron_timers', False) and HAVE_MEGATRON_CORE_TIMERS: + self.megatron_timers_cfg = dict(self.cfg.get('megatron_timer_kwargs', dict())) + if 'log_every_n_steps' not in self.megatron_timers_cfg: + self.megatron_timers_cfg['log_every_n_steps'] = self.trainer.log_every_n_steps + if 'log_option' not in self.megatron_timers_cfg: + self.megatron_timers_cfg['log_option'] = 'minmax' # minmax, max, all + if 'barrier' not in self.megatron_timers_cfg: + self.megatron_timers_cfg['barrier'] = False + self.megatron_timers = Timers(log_level=2, log_option=self.megatron_timers_cfg['log_option']) + # set the megatron core model parallel config 
self.model_parallel_config: ModelParallelConfig = self.build_model_parallel_config() @@ -615,6 +633,13 @@ def sync_overlap_parameters(self, params=None): def on_train_batch_end(self, outputs, dataloader_iter: Any, batch_idx: int, unused: Optional[int] = 0) -> None: super().on_train_batch_end(outputs, dataloader_iter, batch_idx) + # Megatron Timers + if self.megatron_timers: + if self.global_step % self.megatron_timers_cfg["log_every_n_steps"] == 0: + logging.info( + "\n " + self.megatron_timers.get_all_timers_string(barrier=self.megatron_timers_cfg["barrier"]) + ) + # TODO: Replace with newer override for scheduler.step() instead of # search for plugins for fp16 GradScalar if self.trainer.precision_plugin is not None and isinstance( @@ -1044,7 +1069,7 @@ def build_model_parallel_config(self) -> ModelParallelConfig: and megatron_amp_O2, # NeMo does not currently support fp16 training with megatron amp O2, eval and inference is supported "bf16": self.torch_dtype == torch.bfloat16 and megatron_amp_O2, "params_dtype": self.params_dtype, - "timers": None, # NeMo does not currently support megatron core timers + "timers": self.megatron_timers, "async_tensor_model_parallel_allreduce": self.cfg.get('tensor_model_parallel_world_size', 1) > 1 and not self.cfg.get('sequence_parallel', False), "pipeline_dtype": pipeline_dtype, @@ -1157,3 +1182,16 @@ def configure_sharded_model(self): # Move the CPU-initialized model (with `use_cpu_initialization=True`) to GPU, which is to avoid # out-of-memory carash before sharding. In case of GPU-initialized model, this is no-op. self.model = self.model.cuda(torch.cuda.current_device()) + + def megatron_timer_start(self, name, log_level): + if self.megatron_timers: + self.megatron_timers(name, log_level).start(barrier=False) + + def megatron_timer_stop(self, name): + if self.megatron_timers: + self.megatron_timers(name).stop() + + def optimizer_step(self, *args, **kwargs): + self.megatron_timer_start('optimizer', log_level=1) + super().optimizer_step(*args, **kwargs) + self.megatron_timer_stop('optimizer') diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 9c3657d4c4ef..2770090a7c1e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -654,8 +654,11 @@ def training_step(self, dataloader_iter, batch_idx): # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): + self.megatron_timer_start('allreduce_sequence_parallel_gradients', log_level=1) self.allreduce_sequence_parallel_gradients() + self.megatron_timer_stop('allreduce_sequence_parallel_gradients') + self.megatron_timer_start('gradient_allreduce', log_level=1) if self.use_fsdp: # Reduce the gradients omitted from FSDP-sharding self.allreduce_fsdp_sharding_omitted_gradients() @@ -673,12 +676,15 @@ def training_step(self, dataloader_iter, batch_idx): # async grad allreduce is not currently implemented for O1/autocasting mixed precision training # so we all-reduce gradients after the pipeline self.allreduce_gradients() # @sangkug we think this is causing memory to blow up (hurts perf) + self.megatron_timer_stop('gradient_allreduce') if self.cfg.get('pipeline_model_parallel_size', 1) > 1 and self.cfg.get( 'share_embeddings_and_output_weights', True ): + 
self.megatron_timer_start('allreduce_first_last_embeddings', log_level=1) # when using pipeline parallelism the first and last stage must keep embeddings in sync self.allreduce_first_last_embeddings() + self.megatron_timer_stop('allreduce_first_last_embeddings') ## logging if self.log_train_loss: From e89f8be91231021c51298f55f4dd8e1470cd3047 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:34:36 -0700 Subject: [PATCH 10/28] Fix dreambooth data sampler issue (#8400) (#8413) * Turn on drop last * Some neva fixes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Michal Futrega --- .../multimodal/models/text_to_image/dreambooth/dreambooth.py | 2 +- nemo/collections/multimodal/parts/utils.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py index ce82da9bd171..704f8b39371a 100644 --- a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py +++ b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py @@ -487,7 +487,7 @@ def setup_training_data(self, cfg): global_batch_size=self.cfg.global_batch_size, data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state.get_data_parallel_world_size(), - drop_last=False, + drop_last=True, ) self._train_dl = torch.utils.data.DataLoader( diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index c82e0cd37140..4d4e952db0ce 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -22,9 +22,11 @@ from pytorch_lightning.plugins.environments import TorchElasticEnvironment from transformers import CLIPImageProcessor +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP from nemo.utils import AppState, logging +from nemo.utils.model_utils import inject_model_parallel_rank try: from megatron.core import dist_checkpointing @@ -361,6 +363,7 @@ def create_neva_model_and_processor(cfg): neva_cfg.activations_checkpoint_method = None neva_cfg.precision = trainer.precision neva_cfg.mm_cfg.llm.from_pretrained = cfg.get('base_model_file', None) + neva_cfg.apply_rope_fusion = False # neva_cfg.mm_cfg.vision_encoder.from_pretrained = None model = MegatronNevaModel.restore_from( From 8575c315fa71e43631508e8500af535f38c65433 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:35:08 -0700 Subject: [PATCH 11/28] add ensemble decoding fix (#8427) (#8433) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Signed-off-by: Michal Futrega --- examples/asr/transcribe_speech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 3492f2ee99a4..2caa3009c4d3 100644 --- a/examples/asr/transcribe_speech.py +++ 
b/examples/asr/transcribe_speech.py @@ -265,7 +265,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis asr_model.encoder.set_default_att_context_size(cfg.att_context_size) # Setup decoding strategy - if hasattr(asr_model, 'change_decoding_strategy'): + if hasattr(asr_model, 'change_decoding_strategy') and hasattr(asr_model, 'decoding'): if isinstance(asr_model.decoding, MultiTaskDecoding): cfg.multitask_decoding.compute_langs = cfg.compute_langs cfg.multitask_decoding.preserve_alignments = cfg.preserve_alignment From d440d7d2ce03a77ee7969a31610f7ff3bdc47b0e Mon Sep 17 00:00:00 2001 From: Pratyush Muthukumar <30813477+PannuMuthu@users.noreply.github.com> Date: Thu, 15 Feb 2024 19:37:08 -0800 Subject: [PATCH 12/28] NeVA Tutorial Notebook (#8217) * init commit - neva tutorial Signed-off-by: Pratyush Muthukumar * NeVA tutorial notebook Signed-off-by: Pratyush Muthukumar * init commit - neva tutorial Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar * NeVA tutorial notebook Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar * requested changes Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar * add inference via script Signed-off-by: Pratyush Muthukumar * requested changes Signed-off-by: Pratyush Muthukumar * requested changes Signed-off-by: Pratyush Muthukumar * add codeblocks to run torchrun in notebook Signed-off-by: Pratyush Muthukumar --------- Signed-off-by: Pratyush Muthukumar Signed-off-by: Pratyush Muthukumar Co-authored-by: Pratyush Muthukumar Signed-off-by: Michal Futrega --- tutorials/multimodal/NeVA Tutorial.ipynb | 366 +++++++++++++++++++++++ 1 file changed, 366 insertions(+) create mode 100644 tutorials/multimodal/NeVA Tutorial.ipynb diff --git a/tutorials/multimodal/NeVA Tutorial.ipynb b/tutorials/multimodal/NeVA Tutorial.ipynb new file mode 100644 index 000000000000..7a9a1a3a7b4b --- /dev/null +++ b/tutorials/multimodal/NeVA Tutorial.ipynb @@ -0,0 +1,366 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a2225742c5996304", + "metadata": {}, + "source": [ + "# NeVA Training / Inference Tutorial\n", + "\n", + "### Note:\n", + "Currently, this notebook must be run in a NeMo container. An example command to launch the container:\n", + "\n", + "```\n", + "docker run --gpus all -it --rm -v :/opt/NeMo --shm-size=8g \\\n", + " -p 8888:8888 --ulimit memlock=-1 --ulimit \\\n", + " stack=67108864 \n", + "```\n", + "\n", + "## Introduction\n", + "\n", + "This notebook illustrates how to train and perform inference using NeVA with the NeMo Toolkit. NeVA originates from [LLaVA](https://github.com/haotian-liu/LLaVA) (Large Language and Vision Assistant) and is a powerful multimodal image-text instruction tuned model optimized within the NeMo framework. \n", + "\n", + "\n", + "This tutorial will guide you through the following topics:\n", + "1. Training a NeVA model\n", + "2. 
Performing inference with the trained model\n", + "\n", + "## Datasets\n", + "\n", + "After downloading all below datasets for pretraining and instruction tuning, your dataset directory should look something similar to:\n", + "\n", + "```\n", + "LLaVA-Pretrain-LCS-558K\n", + "├── blip_laion_cc_sbu_558k.json\n", + "├── images\n", + "LLaVA-Instruct-mixture\n", + "├── llava_v1_5_mix665k.json\n", + "└── images\n", + " └── ...\n", + "```\n", + "\n", + "### Pre-Training Dataset\n", + "\n", + "The pre-training dataset is open-sourced from the LLaVA implementation and can be downloaded [here](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain). The dataset consists of a 558K subset of the LAION-CC-SBU dataset with BLIP captions. \n", + "\n", + "The associated images for pretraining can be downloaded via HuggingFace [here](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/blob/main/images.zip).\n", + "\n", + "### Instruction Tuning Dataset\n", + "\n", + "The instruction tuning annotations are sourced from the LLaVA implementation and are available [here](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json).\n", + "\n", + "The associated images for the mixture instruction tuning annotations can be found [here](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#visual-instruction-tuning). After extracting, the data should be formatted as follows:\n", + "\n", + "```\n", + " images\n", + " ├── coco\n", + " │ └── train2017\n", + " ├── gqa\n", + " │ └── images\n", + " ├── ocr_vqa\n", + " │ └── images\n", + " ├── textvqa\n", + " │ └── train_images\n", + " └── vg\n", + " ├── VG_100K\n", + " └── VG_100K_2\n", + "```\n", + "\n", + "## Training\n", + "\n", + "\n", + "### Feature Alignment Pre-Training\n", + "\n", + "We provide a set of scripts for pre-training and fine-tuning which can be kicked off with CLI flags defining specified arguments. \n", + "\n", + "An example of a pre-training script execution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3930351e", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "! 
torchrun --nproc_per_node=4 /opt/NeMo/examples/multimodal/multimodal_llm/neva/neva_pretrain.py \\\n", + " ++cluster_type=BCP \\\n", + " trainer.precision=bf16 \\\n", + " trainer.num_nodes=1 \\\n", + " trainer.devices=4 \\\n", + " trainer.val_check_interval=1000 \\\n", + " trainer.limit_val_batches=5 \\\n", + " trainer.log_every_n_steps=1 \\\n", + " trainer.max_steps=1000 \\\n", + " model.megatron_amp_O2=True \\\n", + " model.micro_batch_size=1 \\\n", + " model.global_batch_size=2 \\\n", + " model.tensor_model_parallel_size=4 \\\n", + " model.pipeline_model_parallel_size=1 \\\n", + " model.mcore_gpt=True \\\n", + " model.transformer_engine=True \\\n", + " model.data.data_path=/path/to/datasets/LLaVA-Pretrain-LCS-558K/blip_laion_cc_sbu_558k.json \\\n", + " model.data.image_folder=/path/to/dataset/LLaVA-Pretrain-LCS-558K/images \\\n", + " model.tokenizer.library=sentencepiece \\\n", + " model.tokenizer.model=/path/to/tokenizer/model \\\n", + " model.encoder_seq_length=4096 \\\n", + " model.num_layers=32 \\\n", + " model.hidden_size=4096 \\\n", + " model.ffn_hidden_size=16384 \\\n", + " model.num_attention_heads=32 \\\n", + " model.normalization=layernorm1p \\\n", + " model.do_layer_norm_weight_decay=False \\\n", + " model.apply_query_key_layer_scaling=True \\\n", + " model.activation=squared-relu \\\n", + " model.headscale=False \\\n", + " model.position_embedding_type=rope \\\n", + " model.rotary_percentage=0.5 \\\n", + " model.num_query_groups=null \\\n", + " model.data.num_workers=0 \\\n", + " model.mm_cfg.llm.from_pretrained=/path/to/checkpoint \\\n", + " model.mm_cfg.llm.model_type=nvgpt \\\n", + " model.data.conv_template=nvgpt \\\n", + " model.mm_cfg.vision_encoder.from_pretrained='openai/clip-vit-large-patch14' \\\n", + " model.mm_cfg.vision_encoder.from_hf=True \\\n", + " model.data.image_token_len=256 \\\n", + " model.optim.name=\"fused_adam\" \\\n", + " exp_manager.create_checkpoint_callback=True \\\n", + " exp_manager.create_wandb_logger=False \\\n", + " exp_manager.wandb_logger_kwargs.project=neva_demo" + ] + }, + { + "cell_type": "markdown", + "id": "6b619e0a", + "metadata": {}, + "source": [ + "\n", + "\n", + "**Note**: To initialize training a model from scratch rather than from a pretrained checkpoint, you may specify `null` instead of a path in the CLI arguments.\n", + "\n", + "### Image-Language Pair Instruction Fine-Tuning\n", + "\n", + "Fine-tuning can also be run from within the container via a similar command leveraging the `neva_finetune.py` script.\n", + "\n", + "An example of an image-text pair instruction tuning script execution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97963224", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "! 
torchrun --nproc_per_node=4 /opt/NeMo/examples/multimodal/multimodal_llm/neva/neva_finetune.py \\\n", + "++cluster_type=BCP \\\n", + " trainer.precision=bf16 \\\n", + " trainer.num_nodes=1 \\\n", + " trainer.devices=1 \\\n", + " trainer.val_check_interval=100 \\\n", + " trainer.limit_val_batches=50 \\\n", + " trainer.max_steps=4900 \\\n", + " model.megatron_amp_O2=True \\\n", + " model.micro_batch_size=4 \\\n", + " model.global_batch_size=32 \\\n", + " model.tensor_model_parallel_size=1 \\\n", + " model.pipeline_model_parallel_size=1 \\\n", + " model.mcore_gpt=True \\\n", + " model.transformer_engine=True \\\n", + " model.data.data_path=/path/to/dataset/LLaVA-Pretrain-LCS-558K/blip_laion_cc_sbu_558k.json \\\n", + " model.data.image_folder=/path/to/dataset/LLaVA-Pretrain-LCS-558K/images \\\n", + " model.tokenizer.library=megatron \\\n", + " model.tokenizer.model=/path/to/tokenizer \\\n", + " model.encoder_seq_length=4096 \\\n", + " model.num_layers=24 \\\n", + " model.hidden_size=2048 \\\n", + " model.ffn_hidden_size=5440 \\\n", + " model.num_attention_heads=16 \\\n", + " model.normalization=layernorm1p \\\n", + " model.do_layer_norm_weight_decay=False \\\n", + " model.apply_query_key_layer_scaling=True \\\n", + " model.activation=fast-swiglu \\\n", + " model.headscale=False \\\n", + " model.position_embedding_type=rope \\\n", + " model.rotary_percentage=0.5 \\\n", + " model.num_query_groups=null \\\n", + " model.data.num_workers=8 \\\n", + " model.mm_cfg.llm.from_pretrained=/path/to/checkpoint \\\n", + " model.mm_cfg.llm.model_type=nvgpt \\\n", + " exp_manager.create_checkpoint_callback=True \\\n", + " model.data.conv_template=nvgpt \\\n", + " model.mm_cfg.vision_encoder.from_pretrained='openai/clip-vit-large-patch14' \\\n", + " model.mm_cfg.vision_encoder.from_hf=True \\\n", + " model.data.image_token_len=256 \\\n", + " model.optim.name=\"fused_adam\"" + ] + }, + { + "cell_type": "markdown", + "id": "d69e937c", + "metadata": {}, + "source": [ + "## Inference\n", + "\n", + "### From Pre-trained Checkpoints\n", + "\n", + "If you would like to use NeVA for inference from pre-trained checkpoint via HuggingFace, you can convert from HuggingFace to `.nemo` first.\n", + "\n", + "First, download the model checkpoint from HuggingFace [here](https://huggingface.co/liuhaotian/llava-v1.5-7b). The tokenizer (stored as `tokenizer.model` within the pretrained checkpoint) must be modified with the following commands:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d30003f", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "! cd /opt/sentencepiece/src/\n", + "! protoc --python_out=/opt/NeMo/scripts/tokenizers/ sentencepiece_model.proto\n", + "! python /opt/NeMo/scripts/tokenizers/add_special_tokens_to_sentencepiece.py \\\n", + "--input_file /path/to/tokenizer.model \\\n", + "--output_file /path/to/tokenizer_neva.model \\\n", + "--is_userdefined \\\n", + "--tokens \"\" \"\" \"\" \"\" \\\n", + " \"\" \"\" \"\" \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "470c093b", + "metadata": {}, + "source": [ + "Finally, convert to `.nemo` via the provided script:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f398c26", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "! 
python /opt/NeMo/examples/multimodal/mllm/neva/convert_hf_llava_to_neva.py \\\n", + "--in-file /path/to/llava-v1.5-7b \\\n", + "--out-file /path/to/llava-v1.5-7b.nemo \\\n", + "--tokenizer-model /path/to/tokenizer_neva.model" + ] + }, + { + "cell_type": "markdown", + "id": "5235639a", + "metadata": {}, + "source": [ + "### Running Inference\n", + "\n", + "NeVA inference via the NeMo framework can be quickly spun up via the NeMo Launcher and a few modifications to use the default NeVA inference config file.\n", + "\n", + "Inference can be run with a similar command leveraging the provided inference script `neva_evaluation.py` within the container.\n", + "\n", + "An example of an inference script execution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee0156ea", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "! torchrun --nproc_per_node=1 /opt/NeMo/examples/multimodal/multimodal_llm/neva/neva_evaluation.py \\\n", + "tensor_model_parallel_size=1 \\\n", + "pipeline_model_parallel_size=1 \\\n", + "neva_model_file=/path/to/checkpoint \\\n", + "trainer.devices=1 \\\n", + "trainer.precision=bf16 \\\n", + "prompt_file=/path/to/prompt/file \\\n", + "inference.images_base_path=/path/to/image \\\n", + "output_file=path/for/output/file/ \\\n", + "inference.temperature=0.2 \\\n", + "inference.top_k=0 \\\n", + "inference.top_p=0.9 \\\n", + "inference.greedy=False \\\n", + "inference.add_BOS=False \\\n", + "inference.all_probs=False \\\n", + "inference.repetition_penalty=1.2 \\\n", + "inference.insert_image_token=null \\\n", + "inference.tokens_to_generate=256 \\\n", + "quantization.algorithm=awq \\\n", + "quantization.enable=False" + ] + }, + { + "cell_type": "markdown", + "id": "7d989385", + "metadata": {}, + "source": [ + "#### Running Inference via Launcher\n", + "\n", + "Inference can also be run via the NeMo Launcher, where parameters are specified in the inference config file rather than CLI arguments. To customize the default config provided in `conf/config.yaml` for NeVA inference, see below.\n", + "\n", + "##### Inference Config Setup\n", + "1. Modify `fw_inference` within `defaults` to use `neva/inference` \n", + "2. In `stages`, ensure that `fw_inference` is included\n", + "3. Within the `inference.yaml` default NeVA inference config file, ensure that the path to the `prompt` file, `neva_model_file`, and `images_base_path` within `inference` are specified.\n", + "\n", + "Once either the necessary checkpoints have been loaded or the training workflow is complete, inference can be executed within the launcher pipeline with the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68d434ff", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "! 
python3 main.py" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From cca91417eda188444adedd4a402216abdb69462a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 15 Feb 2024 23:54:42 -0700 Subject: [PATCH 13/28] mcore customization doc minor fix (#8421) (#8437) Signed-off-by: Huiying Li Co-authored-by: Huiying Signed-off-by: Michal Futrega --- docs/source/nlp/nemo_megatron/mcore_customization.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/nlp/nemo_megatron/mcore_customization.rst b/docs/source/nlp/nemo_megatron/mcore_customization.rst index 1f6648d9c899..618ece3f2e48 100644 --- a/docs/source/nlp/nemo_megatron/mcore_customization.rst +++ b/docs/source/nlp/nemo_megatron/mcore_customization.rst @@ -1,7 +1,7 @@ Megatron Core Customization --------------------------- -Megatron Core (Mcore) offers a range of functionalities, one of the most notable being the ability for users to train GPT models on an epic scale. Users can use ``megatron.core.models.gpt.GPTModel`` (Mcore GPTModel) to initialize the model, and then pretrain/load weights into the model. Mcore GPTModel adopts the typical GPT structure, beginning with embedding layer, positional encoding, followed by a series of transformer layers and finally output layer. +Megatron Core (Mcore) offers a range of functionalities, one of the most notable being the ability for users to train Transformer models on an epic scale. Users can enable decoder/GPT variants by using ``megatron.core.models.gpt.GPTModel`` (Mcore GPTModel) to initialize the model, and then pretrain/load weights into the model. Mcore GPTModel adopts the typical GPT structure, beginning with embedding layer, positional encoding, followed by a series of transformer layers and finally output layer. In the rapidly advancing world of LLM, it is increasingly important to experiment with various configurations of the transformer block within each transformer layer. Some of these configurations involve the use of different module classes. While it is possible to achieve this with “if else” statements in Mcore, doing so makes Mcore less readable and less maintainable in the long term. Mcore spec intends to solve this challenge by allowing users to specify a customization of the transformer block in each layer, without modifying code in mcore. We will dive more into the details of mcore spec in the first section of this blog. Then, we will demonstrate the usefulness of mcore spec using Falcon as an example. From 7110f686e58e670cb8a1755e2d9188ad8791e4da Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Fri, 16 Feb 2024 12:33:05 +0400 Subject: [PATCH 14/28] Add `loop_labels` algorithm for TDT greedy decoding (#8215) * Add `loop_labels` algorithm for TDT greedy decoding Signed-off-by: Vladimir Bataev * Use `loop_labels` by default Signed-off-by: Vladimir Bataev * Loop labels greedy decoding v2 Signed-off-by: Vladimir Bataev * Add comments. 
Clean up Signed-off-by: Vladimir Bataev * Add comments Signed-off-by: Vladimir Bataev * Add comments Signed-off-by: Vladimir Bataev * Add tests for batched hypotheses Signed-off-by: Vladimir Bataev * Add tests for batched alignments Signed-off-by: Vladimir Bataev * Add comments Signed-off-by: Vladimir Bataev * Fix comment Signed-off-by: Vladimir Bataev * Fix test Signed-off-by: Vladimir Bataev * Add computer for TDT Signed-off-by: Vladimir Bataev * Fix TDT decoding algorithm Signed-off-by: Vladimir Bataev * Use loop frames by default for TDT Signed-off-by: Vladimir Bataev * Remove "loop frames" implementation for TDT Signed-off-by: Vladimir Bataev * Clean up Signed-off-by: Vladimir Bataev * Add comments Signed-off-by: Vladimir Bataev * Fix confidence. Use tensor for durations. Signed-off-by: Vladimir Bataev --------- Signed-off-by: Vladimir Bataev Signed-off-by: Michal Futrega --- .../parts/submodules/rnnt_greedy_decoding.py | 199 +++---------- .../submodules/tdt_loop_labels_computer.py | 268 ++++++++++++++++++ 2 files changed, 301 insertions(+), 166 deletions(-) create mode 100644 nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index 5af83a53dfc4..a39f4f5746c3 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -35,6 +35,7 @@ from nemo.collections.asr.modules import rnnt_abstract from nemo.collections.asr.parts.submodules.rnnt_loop_labels_computer import GreedyBatchedRNNTLoopLabelsComputer +from nemo.collections.asr.parts.submodules.tdt_loop_labels_computer import GreedyBatchedTDTLoopLabelsComputer from nemo.collections.asr.parts.utils import rnnt_utils from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodConfig, ConfidenceMethodMixin from nemo.collections.common.parts.rnn import label_collate @@ -2638,8 +2639,20 @@ def __init__( # Depending on availability of `blank_as_pad` support # switch between more efficient batch decoding technique + self._decoding_computer = None if self.decoder.blank_as_pad: - self._greedy_decode = self._greedy_decode_blank_as_pad + # batched "loop frames" is not implemented for TDT + self._decoding_computer = GreedyBatchedTDTLoopLabelsComputer( + decoder=self.decoder, + joint=self.joint, + blank_index=self._blank_index, + durations=self.durations, + max_symbols_per_step=self.max_symbols, + preserve_alignments=preserve_alignments, + preserve_frame_confidence=preserve_frame_confidence, + confidence_method_cfg=confidence_method_cfg, + ) + self._greedy_decode = self._greedy_decode_blank_as_pad_loop_labels else: self._greedy_decode = self._greedy_decode_masked @@ -2685,179 +2698,33 @@ def forward( return (packed_result,) - def _greedy_decode_blank_as_pad( + def _greedy_decode_masked( self, x: torch.Tensor, out_len: torch.Tensor, device: torch.device, partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, ): - if partial_hypotheses is not None: - raise NotImplementedError("`partial_hypotheses` support is not supported") - - with torch.inference_mode(): - # x: [B, T, D] - # out_len: [B] - # device: torch.device - - # Initialize list of Hypothesis - batchsize = x.shape[0] - hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) - ] - - # Initialize Hidden state matrix (shared by entire batch) - hidden = None - - # If 
alignments need to be preserved, register a danling list to hold the values - if self.preserve_alignments: - # alignments is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.alignments = [[]] - - # If confidence scores need to be preserved, register a danling list to hold the values - if self.preserve_frame_confidence: - # frame_confidence is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.frame_confidence = [[]] - - # Last Label buffer + Last Label without blank buffer - # batch level equivalent of the last_label - last_label = torch.full([batchsize, 1], fill_value=self._blank_index, dtype=torch.long, device=device) - - # Mask buffers - blank_mask = torch.full([batchsize], fill_value=0, dtype=torch.bool, device=device) - - # Get max sequence length - max_out_len = out_len.max() - - # skip means the number of frames the next decoding step should "jump" to. When skip == 1 - # it means the next decoding step will just use the next input frame. - skip = 1 - for time_idx in range(max_out_len): - if skip > 1: # if skip > 1 at the current step, we decrement it and skip the current frame. - skip -= 1 - continue - f = x.narrow(dim=1, start=time_idx, length=1) # [B, 1, D] - - # need_to_stay is a boolean indicates whether the next decoding step should remain in the same frame. - need_to_stay = True - symbols_added = 0 - - # Reset blank mask - blank_mask.mul_(False) - - # Update blank mask with time mask - # Batch: [B, T, D], but Bi may have seq len < max(seq_lens_in_batch) - # Forcibly mask with "blank" tokens, for all sample where current time step T > seq_len - blank_mask = time_idx >= out_len - - # Start inner loop - while need_to_stay and (self.max_symbols is None or symbols_added < self.max_symbols): - # Batch prediction and joint network steps - # If very first prediction step, submit SOS tag (blank) to pred_step. - # This feeds a zero tensor as input to AbstractRNNTDecoder to prime the state - if time_idx == 0 and symbols_added == 0 and hidden is None: - g, hidden_prime = self._pred_step(self._SOS, hidden, batch_size=batchsize) - else: - # Perform batch step prediction of decoder, getting new states and scores ("g") - g, hidden_prime = self._pred_step(last_label, hidden, batch_size=batchsize) - - # Batched joint step - Output = [B, V + 1 + num-big-blanks] - # Note: log_normalize must not be True here since the joiner output is contanetation of both token logits and duration logits, - # and they need to be normalized independently. - joined = self._joint_step(f, g, log_normalize=None) - logp = joined[:, 0, 0, : -len(self.durations)] - duration_logp = joined[:, 0, 0, -len(self.durations) :] - - if logp.dtype != torch.float32: - logp = logp.float() - duration_logp = duration_logp.float() - - # get the max for both token and duration predictions. - v, k = logp.max(1) - dv, dk = duration_logp.max(1) - - # here we set the skip value to be the minimum of all predicted durations, hense the "torch.min(dk)" call there. - # Please refer to Section 5.2 of our paper https://arxiv.org/pdf/2304.06795.pdf for explanation of this. - skip = self.durations[int(torch.min(dk))] - - # this is a special case: if all batches emit blanks, we require that skip be at least 1 - # so we don't loop forever at the current frame. 
- if blank_mask.all(): - if skip == 0: - skip = 1 - - need_to_stay = skip == 0 - del g - - # Update blank mask with current predicted blanks - # This is accumulating blanks over all time steps T and all target steps min(max_symbols, U) - k_is_blank = k == self._blank_index - blank_mask.bitwise_or_(k_is_blank) - - del k_is_blank - del logp, duration_logp - - # If all samples predict / have predicted prior blanks, exit loop early - # This is equivalent to if single sample predicted k - if not blank_mask.all(): - # Collect batch indices where blanks occurred now/past - blank_indices = (blank_mask == 1).nonzero(as_tuple=False) - - # Recover prior state for all samples which predicted blank now/past - if hidden is not None: - hidden_prime = self.decoder.batch_copy_states(hidden_prime, hidden, blank_indices) - - elif len(blank_indices) > 0 and hidden is None: - # Reset state if there were some blank and other non-blank predictions in batch - # Original state is filled with zeros so we just multiply - # LSTM has 2 states - hidden_prime = self.decoder.batch_copy_states(hidden_prime, None, blank_indices, value=0.0) - - # Recover prior predicted label for all samples which predicted blank now/past - k[blank_indices] = last_label[blank_indices, 0] - - # Update new label and hidden state for next iteration - last_label = k.clone().view(-1, 1) - hidden = hidden_prime - - # Update predicted labels, accounting for time mask - # If blank was predicted even once, now or in the past, - # Force the current predicted label to also be blank - # This ensures that blanks propogate across all timesteps - # once they have occured (normally stopping condition of sample level loop). - for kidx, ki in enumerate(k): - if blank_mask[kidx] == 0: - hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) - hypotheses[kidx].score += float(v[kidx]) - - symbols_added += 1 - - # Remove trailing empty list of alignments at T_{am-len} x Uj - if self.preserve_alignments: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].alignments[-1]) == 0: - del hypotheses[batch_idx].alignments[-1] - - # Remove trailing empty list of confidence scores at T_{am-len} x Uj - if self.preserve_frame_confidence: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) == 0: - del hypotheses[batch_idx].frame_confidence[-1] - - # Preserve states - for batch_idx in range(batchsize): - hypotheses[batch_idx].dec_state = self.decoder.batch_select_state(hidden, batch_idx) - - return hypotheses + raise NotImplementedError("masked greedy-batched decode is not supported for TDT models.") - def _greedy_decode_masked( + @torch.inference_mode() + def _greedy_decode_blank_as_pad_loop_labels( self, x: torch.Tensor, out_len: torch.Tensor, device: torch.device, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - raise NotImplementedError("masked greedy-batched decode is not supported for TDT models.") + partial_hypotheses: Optional[list[rnnt_utils.Hypothesis]] = None, + ) -> list[rnnt_utils.Hypothesis]: + """ + Optimized batched greedy decoding. 
+ The main idea: search for next labels for the whole batch (evaluating Joint) + and thus always evaluate prediction network with maximum possible batch size + """ + if partial_hypotheses is not None: + raise NotImplementedError("`partial_hypotheses` support is not implemented") + + batched_hyps, alignments, last_decoder_state = self._decoding_computer(x=x, out_len=out_len) + hyps = rnnt_utils.batched_hyps_to_hypotheses(batched_hyps, alignments) + for hyp, state in zip(hyps, self.decoder.batch_split_states(last_decoder_state)): + hyp.dec_state = state + return hyps diff --git a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py new file mode 100644 index 000000000000..ce34d8362171 --- /dev/null +++ b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py @@ -0,0 +1,268 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from omegaconf import DictConfig, ListConfig + +from nemo.collections.asr.parts.utils import rnnt_utils +from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodMixin + + +class GreedyBatchedTDTLoopLabelsComputer(ConfidenceMethodMixin): + """ + Loop Labels algorithm implementation. Callable. + """ + + def __init__( + self, + decoder, + joint, + blank_index: int, + durations: Union[list[int], ListConfig[int]], + max_symbols_per_step: Optional[int] = None, + preserve_alignments=False, + preserve_frame_confidence=False, + confidence_method_cfg: Optional[DictConfig] = None, + ): + """ + Init method. + Args: + decoder: Prediction network from RNN-T + joint: Joint module from RNN-T + blank_index: index of blank symbol + durations: list of TDT durations, e.g., [0, 1, 2, 4, 8] + max_symbols_per_step: max symbols to emit on each step (to avoid infinite looping) + preserve_alignments: if alignments are needed + preserve_frame_confidence: if frame confidence is needed + confidence_method_cfg: config for the confidence + """ + super().__init__() + self.decoder = decoder + self.joint = joint + # keep durations on CPU to avoid side effects in multi-gpu environment + self.durations = torch.tensor(list(durations), device="cpu").to(torch.long) + self._blank_index = blank_index + self.max_symbols = max_symbols_per_step + self.preserve_alignments = preserve_alignments + self.preserve_frame_confidence = preserve_frame_confidence + self._SOS = self._blank_index + self._init_confidence_method(confidence_method_cfg=confidence_method_cfg) + assert self._SOS == self._blank_index # "blank as pad" algorithm only + + def __call__( + self, x: torch.Tensor, out_len: torch.Tensor, + ) -> Tuple[rnnt_utils.BatchedHyps, Optional[rnnt_utils.BatchedAlignments], Any]: + """ + Optimized batched greedy decoding. 
+ Iterates over labels, on each step finding the next non-blank label + (evaluating Joint multiple times in inner loop); It uses a minimal possible amount of calls + to prediction network (with maximum possible batch size), + which makes it especially useful for scaling the prediction network. + During decoding all active hypotheses ("texts") have the same lengths. + + Args: + x: output from the encoder + out_len: lengths of the utterances in `x` + """ + batch_size, max_time, _unused = x.shape + device = x.device + + x = self.joint.project_encoder(x) # do not recalculate joint projection, project only once + + # init output structures: BatchedHyps (for results), BatchedAlignments + last decoder state + # init empty batched hypotheses + batched_hyps = rnnt_utils.BatchedHyps( + batch_size=batch_size, + init_length=max_time * self.max_symbols if self.max_symbols is not None else max_time, + device=x.device, + float_dtype=x.dtype, + ) + # sample state, will be replaced further when the decoding for hypothesis is done + last_decoder_state = self.decoder.initialize_state(x) + # init alignments if necessary + use_alignments = self.preserve_alignments or self.preserve_frame_confidence + # always use alignments variable - for torch.jit adaptation, but keep it as minimal as possible + alignments = rnnt_utils.BatchedAlignments( + batch_size=batch_size, + logits_dim=self.joint.num_classes_with_blank, + init_length=max_time * 2 if use_alignments else 1, # blank for each timestep + text tokens + device=x.device, + float_dtype=x.dtype, + store_alignments=self.preserve_alignments, + store_frame_confidence=self.preserve_frame_confidence, + ) + + # durations + all_durations = self.durations.to(device, non_blocking=True) + num_durations = all_durations.shape[0] + + # initial state, needed for torch.jit to compile (cannot handle None) + state = self.decoder.initialize_state(x) + # indices of elements in batch (constant) + batch_indices = torch.arange(batch_size, dtype=torch.long, device=device) + # last found labels - initially () symbol + labels = torch.full_like(batch_indices, fill_value=self._SOS) + + # time indices + time_indices = torch.zeros_like(batch_indices) + safe_time_indices = torch.zeros_like(time_indices) # time indices, guaranteed to be < out_len + time_indices_current_labels = torch.zeros_like(time_indices) + last_timesteps = out_len - 1 + + # masks for utterances in batch + active_mask: torch.Tensor = out_len > 0 + advance_mask = torch.empty_like(active_mask) + + # for storing the last state we need to know what elements became "inactive" on this step + active_mask_prev = torch.empty_like(active_mask) + became_inactive_mask = torch.empty_like(active_mask) + + # loop while there are active utterances + first_step = True + while active_mask.any(): + active_mask_prev.copy_(active_mask, non_blocking=True) + # stage 1: get decoder (prediction network) output + if first_step: + # start of the loop, SOS symbol is passed into prediction network, state is None + # we need to separate this for torch.jit + decoder_output, state, *_ = self.decoder.predict( + labels.unsqueeze(1), None, add_sos=False, batch_size=batch_size + ) + first_step = False + else: + decoder_output, state, *_ = self.decoder.predict( + labels.unsqueeze(1), state, add_sos=False, batch_size=batch_size + ) + decoder_output = self.joint.project_prednet(decoder_output) # do not recalculate joint projection + + # stage 2: get joint output, iteratively seeking for non-blank labels + # blank label in `labels` tensor means "end of hypothesis" 
(for this index) + logits = ( + self.joint.joint_after_projection(x[batch_indices, safe_time_indices].unsqueeze(1), decoder_output,) + .squeeze(1) + .squeeze(1) + ) + scores, labels = logits[:, :-num_durations].max(dim=-1) + jump_durations_indices = logits[:, -num_durations:].argmax(dim=-1) + durations = all_durations[jump_durations_indices] + + # search for non-blank labels using joint, advancing time indices for blank labels + # checking max_symbols is not needed, since we already forced advancing time indices for such cases + blank_mask = labels == self._blank_index + # for blank labels force duration >= 1 + durations.masked_fill_(torch.logical_and(durations == 0, blank_mask), 1) + time_indices_current_labels.copy_(time_indices, non_blocking=True) + if use_alignments: + alignments.add_results_masked_( + active_mask=active_mask, + time_indices=time_indices_current_labels, + logits=logits if self.preserve_alignments else None, + labels=labels if self.preserve_alignments else None, + confidence=self._get_confidence_tensor(F.log_softmax(logits[:, :-num_durations], dim=-1)) + if self.preserve_frame_confidence + else None, + ) + + # advance_mask is a mask for current batch for searching non-blank labels; + # each element is True if non-blank symbol is not yet found AND we can increase the time index + time_indices += durations + torch.minimum(time_indices, last_timesteps, out=safe_time_indices) + torch.less(time_indices, out_len, out=active_mask) + torch.logical_and(active_mask, blank_mask, out=advance_mask) + # inner loop: find next non-blank labels (if exist) + while advance_mask.any(): + # same as: time_indices_current_labels[advance_mask] = time_indices[advance_mask], but non-blocking + # store current time indices to use further for storing the results + torch.where(advance_mask, time_indices, time_indices_current_labels, out=time_indices_current_labels) + logits = ( + self.joint.joint_after_projection( + x[batch_indices, safe_time_indices].unsqueeze(1), decoder_output, + ) + .squeeze(1) + .squeeze(1) + ) + # get labels (greedy) and scores from current logits, replace labels/scores with new + # labels[advance_mask] are blank, and we are looking for non-blank labels + more_scores, more_labels = logits[:, :-num_durations].max(dim=-1) + # same as: labels[advance_mask] = more_labels[advance_mask], but non-blocking + torch.where(advance_mask, more_labels, labels, out=labels) + # same as: scores[advance_mask] = more_scores[advance_mask], but non-blocking + torch.where(advance_mask, more_scores, scores, out=scores) + jump_durations_indices = logits[:, -num_durations:].argmax(dim=-1) + durations = all_durations[jump_durations_indices] + + if use_alignments: + alignments.add_results_masked_( + active_mask=advance_mask, + time_indices=time_indices_current_labels, + logits=logits if self.preserve_alignments else None, + labels=more_labels if self.preserve_alignments else None, + confidence=self._get_confidence_tensor(F.log_softmax(logits[:, :-num_durations], dim=-1)) + if self.preserve_frame_confidence + else None, + ) + + blank_mask = labels == self._blank_index + # for blank labels force duration >= 1 + durations.masked_fill_(torch.logical_and(durations == 0, blank_mask), 1) + # same as time_indices[advance_mask] += durations[advance_mask], but non-blocking + torch.where(advance_mask, time_indices + durations, time_indices, out=time_indices) + torch.minimum(time_indices, last_timesteps, out=safe_time_indices) + torch.less(time_indices, out_len, out=active_mask) + torch.logical_and(active_mask, 
blank_mask, out=advance_mask) + + # stage 3: filter labels and state, store hypotheses + # select states for hyps that became inactive (is it necessary?) + # this seems to be redundant, but used in the `loop_frames` output + torch.ne(active_mask, active_mask_prev, out=became_inactive_mask) + self.decoder.batch_replace_states_mask( + src_states=state, dst_states=last_decoder_state, mask=became_inactive_mask, + ) + + # store hypotheses + if self.max_symbols is not None: + # pre-allocated memory, no need for checks + batched_hyps.add_results_masked_no_checks_( + active_mask, labels, time_indices_current_labels, scores, + ) + else: + # auto-adjusted storage + batched_hyps.add_results_masked_( + active_mask, labels, time_indices_current_labels, scores, + ) + + # stage 4: to avoid looping, go to next frame after max_symbols emission + if self.max_symbols is not None: + # if labels are non-blank (not end-of-utterance), check that last observed timestep with label: + # if it is equal to the current time index, and number of observations is >= max_symbols, force blank + force_blank_mask = torch.logical_and( + active_mask, + torch.logical_and( + torch.logical_and( + labels != self._blank_index, batched_hyps.last_timestep_lasts >= self.max_symbols, + ), + batched_hyps.last_timestep == time_indices, + ), + ) + time_indices += force_blank_mask # emit blank => advance time indices + # update safe_time_indices, non-blocking + torch.minimum(time_indices, last_timesteps, out=safe_time_indices) + # same as: active_mask = time_indices < out_len + torch.less(time_indices, out_len, out=active_mask) + if use_alignments: + return batched_hyps, alignments, last_decoder_state + return batched_hyps, None, last_decoder_state From 84c6c5a87caab6b7ee91f0bd94222d4ecbfc384d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 16 Feb 2024 10:15:54 -0700 Subject: [PATCH 15/28] Add dist ckpt support for regular optimizers (#7749) (#8293) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add dist ckpt support for regular optimizers * [tutorial] fixed missing RIR scripts file. 
(#8257) * fix imports * imports fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ci imports fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert asr notebook * revert asr notebook --------- Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Co-authored-by: mikolajblaz Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Michal Futrega --- nemo/collections/nlp/parts/nlp_overrides.py | 19 +++++++++++++++++-- nemo/core/optim/optimizer_with_main_params.py | 13 ------------- nemo/core/optim/optimizers.py | 15 +++++++++++++++ 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 1012fdf71405..cde0188dff20 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -60,12 +60,14 @@ from nemo.collections.nlp.parts import utils_funcs from nemo.core.connectors.save_restore_connector import SaveRestoreConnector from nemo.core.optim import MainParamsOptimizerWrapper +from nemo.core.optim.optimizers import init_optimizer_states from nemo.utils import AppState, logging from nemo.utils.get_rank import is_global_rank_zero from nemo.utils.model_utils import ckpt_to_dir, inject_model_parallel_rank, uninject_model_parallel_rank try: from apex.transformer.pipeline_parallel.utils import get_num_microbatches + from nemo.core.optim.distributed_adam import MegatronDistributedFusedAdam HAVE_APEX = True @@ -259,7 +261,7 @@ def optimizer_sharded_state_dict(self): ValueError: If a parameter ID does not match any model sharded parameter. """ - optimizer = self.lightning_module.optimizers(use_pl_optimizer=False) # MainParamsOptimizerWrapper + optimizer = self.lightning_module.optimizers(use_pl_optimizer=False) model_sharded_state_dict = self.lightning_module.sharded_state_dict() @@ -268,8 +270,21 @@ def optimizer_sharded_state_dict(self): key: value for key, value in model_sharded_state_dict.items() if not key.endswith('_extra_state') } - if not isinstance(optimizer, MainParamsOptimizerWrapper): + if isinstance(optimizer, MegatronDistributedFusedAdam): return optimizer.sharded_state_dict(model_sharded_state_dict) + elif not isinstance(optimizer, MainParamsOptimizerWrapper): + # Regular optimizer, e.g. 
Adam or FusedAdam + init_optimizer_states(optimizer) + optimizer_state_dict = optimizer.state_dict() + id_to_sharded_param_map = get_param_id_to_sharded_param_map( + model_sharded_state_dict=model_sharded_state_dict, + optim_params_iter=itertools.chain.from_iterable(g['params'] for g in optimizer.param_groups), + ) + optim_state_to_sharding_state(optimizer_state_dict, id_to_sharded_param_map) + return optimizer_state_dict + + # MainParamsOptimizerWrapper + init_optimizer_states(optimizer.optimizer) optimizer_state_dict = optimizer.state_dict() diff --git a/nemo/core/optim/optimizer_with_main_params.py b/nemo/core/optim/optimizer_with_main_params.py index 680b82ed7201..7f8794f746df 100644 --- a/nemo/core/optim/optimizer_with_main_params.py +++ b/nemo/core/optim/optimizer_with_main_params.py @@ -312,9 +312,6 @@ def __init__( self.fp32_from_float16_groups.append(fp32_from_float16_params_this_group) self.fp32_from_fp32_groups.append(fp32_params_this_group) - # init exp_avg and exp_avg_sq before loading optimizer state, needed for dist checkpointing - self._init_opt_state() - # Leverage state_dict() and load_state_dict() to # recast preexisting per-param state tensors self.optimizer.load_state_dict(self.optimizer.state_dict()) @@ -543,13 +540,3 @@ def _set_defaults(self, value): self.optimizer.defaults = value defaults = property(_get_defaults, _set_defaults) - - def _init_opt_state(self): - """ - Initialize the optimizer state with zero tensors for 'exp_avg' and 'exp_avg_sq' of each parameter. - """ - for group in self.optimizer.param_groups: - for p in group['params']: - if len(self.optimizer.state[p]) == 0: - self.optimizer.state[p]['exp_avg'] = torch.zeros_like(p.data) - self.optimizer.state[p]['exp_avg_sq'] = torch.zeros_like(p.data) diff --git a/nemo/core/optim/optimizers.py b/nemo/core/optim/optimizers.py index 1d52a9bf10f8..2cc6be0dfc23 100644 --- a/nemo/core/optim/optimizers.py +++ b/nemo/core/optim/optimizers.py @@ -200,3 +200,18 @@ def get_optimizer(name: str, **kwargs: Optional[Dict[str, Any]]) -> Optimizer: optimizer = AVAILABLE_OPTIMIZERS[name] optimizer = partial(optimizer, **kwargs) return optimizer + + +def init_optimizer_states(optimizer: Optimizer): + adam_nondist_optims = (optim.Adam, optim.AdamW) + if HAVE_APEX: + adam_nondist_optims += (FusedAdam,) + if isinstance(optimizer, adam_nondist_optims): + for group in optimizer.param_groups: + for p in group['params']: + state = optimizer.state[p] + if len(state) == 0: + state['exp_avg'] = torch.zeros_like(p.data, memory_format=torch.preserve_format) + state['exp_avg_sq'] = torch.zeros_like(p.data, memory_format=torch.preserve_format) + if group.get('amsgrad'): + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) From 8d3ddb967a68d2034bf7cf8cd2891dca5cea6d7a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 16 Feb 2024 10:16:16 -0700 Subject: [PATCH 16/28] Multimodal r1.23.0 bug fix (#8315) (#8339) * Rename quick-gelu * ddpm config guard * Fix ddpm edit api * Fix insert_image_token cfg issue * neva updates * reformat * Add back jenkins * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix jenkins * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix bugs * Update default neva template --------- Signed-off-by: yaoyu-33 Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Eric Harper 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Michal Futrega --- Jenkinsfile | 366 +++++++++--------- .../multimodal_llm/neva/neva_evaluation.py | 5 +- .../clip/conf/megatron_clip_VIT-L-14.yaml | 4 +- .../nsfw/conf/megatron_nsfw_config.yaml | 4 +- .../multimodal/data/neva/neva_dataset.py | 2 +- .../models/multimodal_llm/neva/neva_model.py | 8 +- .../instruct_pix2pix/ldm/ddpm_edit.py | 4 +- .../stable_diffusion/ldm/ddpm.py | 6 +- 8 files changed, 202 insertions(+), 197 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 474db51efdc8..c2357e280afb 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -126,72 +126,74 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat' } } -// -// stage('L2: Multimodal Imagen Train') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/imagen_train" -// sh "pip install webdataset==0.2.48" -// sh "python examples/multimodal/text_to_image/imagen/imagen_training.py \ -// trainer.precision=16 \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// model.data.synthetic_data=True \ -// exp_manager.exp_dir=/home/TestData/multimodal/imagen_train \ -// model.inductor=False \ -// model.unet.flash_attention=False \ -// " -// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" -// sh "rm -rf /home/TestData/multimodal/imagen_train" -// } -// } -// -// stage('L2: Multimodal Stable Diffusion Train') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" -// sh "pip install webdataset==0.2.48" -// sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ -// trainer.precision=16 \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// model.data.synthetic_data=True \ -// exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train \ -// model.inductor=False \ -// model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ -// ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ -// ++model.cond_stage_config.max_length=77 \ -// ~model.cond_stage_config.restore_from_path \ -// ~model.cond_stage_config.freeze \ -// ~model.cond_stage_config.layer \ -// model.unet_config.from_pretrained=null \ -// model.first_stage_config.from_pretrained=null \ -// model.unet_config.use_flash_attention=False \ -// " -// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" -// sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" -// } -// } + + stage('L2: Multimodal Imagen Train') { + when { + anyOf { + branch 'r1.23.0' + changeRequest target: 'r1.23.0' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/multimodal/imagen_train" + sh "pip install webdataset==0.2.48" + sh "python examples/multimodal/text_to_image/imagen/imagen_training.py \ + trainer.precision=16 \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + trainer.max_steps=20 \ + model.conditioning.embed_dim=64 \ + model.micro_batch_size=1 \ + 
model.global_batch_size=1 \ + model.data.synthetic_data=True \ + exp_manager.exp_dir=/home/TestData/multimodal/imagen_train \ + model.inductor=False \ + model.unet.flash_attention=False \ + " + sh "pip install 'webdataset>=0.1.48,<=0.1.62'" + sh "rm -rf /home/TestData/multimodal/imagen_train" + } + } + stage('L2: Multimodal Stable Diffusion Train') { + when { + anyOf { + branch 'r1.23.0' + changeRequest target: 'r1.23.0' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" + sh "pip install webdataset==0.2.48" + sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ + trainer.precision=16 \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + trainer.max_steps=20 \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.data.synthetic_data=True \ + exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train \ + model.inductor=False \ + model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ + ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ + ++model.cond_stage_config.max_length=77 \ + ~model.cond_stage_config.restore_from_path \ + ~model.cond_stage_config.freeze \ + ~model.cond_stage_config.layer \ + model.unet_config.from_pretrained=null \ + model.first_stage_config.from_pretrained=null \ + model.unet_config.use_flash_attention=False \ + model.unet_config.attention_resolutions=[1] \ + model.unet_config.channel_mult=[1] \ + " + sh "pip install 'webdataset>=0.1.48,<=0.1.62'" + sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" + } + } // stage('L2: Multimodal ControlNet Train') { // when { // anyOf { @@ -260,122 +262,122 @@ pipeline { // sh "rm -rf /home/TestData/multimodal/dreambooth_train" // } // } -// stage('L2: Vision ViT Pretrain TP=1') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/vision/vit_pretrain_tp1" -// sh "pip install webdataset==0.2.48" -// sh "python examples/vision/vision_transformer/megatron_vit_classification_pretrain.py \ -// trainer.precision=16 \ -// model.megatron_amp_O2=False \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// trainer.val_check_interval=5 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=2 \ -// model.global_batch_size=4 \ -// model.tensor_model_parallel_size=1 \ -// model.pipeline_model_parallel_size=1 \ -// model.data.num_workers=0 \ -// exp_manager.create_checkpoint_callback=False \ -// model.data.data_path=[/home/TestData/multimodal/tiny-imagenet/train,/home/TestData/multimodal/tiny-imagenet/val] \ -// exp_manager.exp_dir=/home/TestData/vision/vit_pretrain_tp1 " -// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" -// sh "rm -rf /home/TestData/vision/vit_pretrain_tp1" -// } -// } -// -// stage('L2: Multimodal CLIP Pretrain TP=1') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1" -// sh "pip install webdataset==0.2.48" -// sh "python examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py \ -// trainer.precision=16 \ -// model.megatron_amp_O2=False \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// trainer.val_check_interval=10 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// 
model.micro_batch_size=1 \ -// model.global_batch_size=1 \ -// model.tensor_model_parallel_size=1 \ -// model.pipeline_model_parallel_size=1 \ -// exp_manager.create_checkpoint_callback=False \ -// model.data.num_workers=0 \ -// model.vision.num_layers=2 \ -// model.text.num_layers=2 \ -// model.vision.patch_dim=32 \ -// model.vision.encoder_seq_length=49 \ -// model.vision.class_token_length=7 \ -// model.data.train.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \ -// model.data.validation.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \ -// model.data.webdataset.local_root_path=/ \ -// exp_manager.exp_dir=/home/TestData/multimodal/clip_pretrain_tp1 " -// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" -// sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1" -// } -// } -// -// stage('L2: Multimodal NeVA Pretrain TP=1') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// steps { -// sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1" -// sh "pip install webdataset==0.2.48" -// sh "python examples/multimodal/multimodal_llm/neva/neva_pretrain.py \ -// trainer.precision=bf16 \ -// model.megatron_amp_O2=False \ -// trainer.num_nodes=1 \ -// trainer.devices=1 \ -// trainer.val_check_interval=10 \ -// trainer.limit_val_batches=5 \ -// trainer.log_every_n_steps=1 \ -// ++exp_manager.max_time_per_run=00:00:03:00 \ -// trainer.max_steps=20 \ -// model.micro_batch_size=2 \ -// model.global_batch_size=4 \ -// model.tensor_model_parallel_size=1 \ -// model.pipeline_model_parallel_size=1 \ -// exp_manager.create_checkpoint_callback=False \ -// model.data.data_path=/home/TestData/multimodal/tiny-neva/dummy.json \ -// model.data.image_folder=/home/TestData/multimodal/tiny-neva/images \ -// model.tokenizer.library=sentencepiece \ -// model.tokenizer.model=/home/TestData/multimodal/tiny-neva/tokenizer_add_special.model \ -// model.num_layers=2 \ -// model.hidden_size=5120 \ -// model.ffn_hidden_size=13824 \ -// model.num_attention_heads=40 \ -// model.normalization=rmsnorm \ -// model.data.num_workers=0 \ -// model.data.conv_template=llama_2 \ -// model.mm_cfg.vision_encoder.from_pretrained='openai/clip-vit-large-patch14' \ -// model.mm_cfg.llm.from_pretrained=null \ -// model.use_flash_attention=false \ -// exp_manager.exp_dir=/home/TestData/multimodal/neva_pretrain_tp1 " -// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" -// sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1" -// } -// } + stage('L2: Vision ViT Pretrain TP=1') { + when { + anyOf { + branch 'r1.23.0' + changeRequest target: 'r1.23.0' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/vision/vit_pretrain_tp1" + sh "pip install webdataset==0.2.48" + sh "python examples/vision/vision_transformer/megatron_vit_classification_pretrain.py \ + trainer.precision=16 \ + model.megatron_amp_O2=False \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + trainer.val_check_interval=5 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + trainer.max_steps=20 \ + model.micro_batch_size=2 \ + model.global_batch_size=4 \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.data.num_workers=0 \ + exp_manager.create_checkpoint_callback=False \ + model.data.data_path=[/home/TestData/multimodal/tiny-imagenet/train,/home/TestData/multimodal/tiny-imagenet/val] \ + exp_manager.exp_dir=/home/TestData/vision/vit_pretrain_tp1 " + sh "pip install 'webdataset>=0.1.48,<=0.1.62'" + sh "rm -rf /home/TestData/vision/vit_pretrain_tp1" + } + 
} + + stage('L2: Multimodal CLIP Pretrain TP=1') { + when { + anyOf { + branch 'r1.23.0' + changeRequest target: 'r1.23.0' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1" + sh "pip install webdataset==0.2.48" + sh "python examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py \ + trainer.precision=16 \ + model.megatron_amp_O2=False \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + trainer.val_check_interval=10 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + trainer.max_steps=20 \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + exp_manager.create_checkpoint_callback=False \ + model.data.num_workers=0 \ + model.vision.num_layers=2 \ + model.text.num_layers=2 \ + model.vision.patch_dim=32 \ + model.vision.encoder_seq_length=49 \ + model.vision.class_token_length=7 \ + model.data.train.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \ + model.data.validation.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \ + model.data.webdataset.local_root_path=/ \ + exp_manager.exp_dir=/home/TestData/multimodal/clip_pretrain_tp1 " + sh "pip install 'webdataset>=0.1.48,<=0.1.62'" + sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1" + } + } + + stage('L2: Multimodal NeVA Pretrain TP=1') { + when { + anyOf { + branch 'r1.23.0' + changeRequest target: 'r1.23.0' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1" + sh "pip install webdataset==0.2.48" + sh "python examples/multimodal/multimodal_llm/neva/neva_pretrain.py \ + trainer.precision=16 \ + model.megatron_amp_O2=False \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=5 \ + trainer.log_every_n_steps=1 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + trainer.max_steps=20 \ + model.micro_batch_size=2 \ + model.global_batch_size=4 \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + exp_manager.create_checkpoint_callback=False \ + model.data.data_path=/home/TestData/multimodal/tiny-neva/dummy.json \ + model.data.image_folder=/home/TestData/multimodal/tiny-neva/images \ + model.tokenizer.library=sentencepiece \ + model.tokenizer.model=/home/TestData/multimodal/tiny-neva/tokenizer_add_special.model \ + model.num_layers=2 \ + model.hidden_size=5120 \ + model.ffn_hidden_size=13824 \ + model.num_attention_heads=40 \ + model.normalization=rmsnorm \ + model.data.num_workers=0 \ + model.data.conv_template=llama_2 \ + model.mm_cfg.vision_encoder.from_pretrained='openai/clip-vit-large-patch14' \ + model.mm_cfg.llm.from_pretrained=null \ + model.use_flash_attention=false \ + exp_manager.exp_dir=/home/TestData/multimodal/neva_pretrain_tp1 " + sh "pip install 'webdataset>=0.1.48,<=0.1.62'" + sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1" + } + } // TODO: this requires TE >= v0.11 which is not available in 23.06. // please uncomment this test once mcore CI is ready. 
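The neva_evaluation.py hunk below swaps a hard attribute lookup for a guarded one, so prompt construction still works when the inference config omits insert_image_token. A minimal sketch of the pattern with a toy config (not the actual evaluation script):

    from omegaconf import OmegaConf

    # Toy inference config that omits the optional key entirely.
    cfg = OmegaConf.create({"inference": {"greedy": True}})

    # Direct attribute access (cfg.inference.insert_image_token) raises on Hydra-style
    # struct configs when the key is missing; get() falls back to a default instead.
    insert_image_token = cfg.inference.get("insert_image_token", None)
    assert insert_image_token is None
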
diff --git a/examples/multimodal/multimodal_llm/neva/neva_evaluation.py b/examples/multimodal/multimodal_llm/neva/neva_evaluation.py index 545a634ac7fb..bd3f975e4d54 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_evaluation.py +++ b/examples/multimodal/multimodal_llm/neva/neva_evaluation.py @@ -71,15 +71,16 @@ def main(cfg) -> None: with open(cfg.prompt_file, 'r') as f: lines = f.readlines() + insert_image_token = cfg.inference.get("insert_image_token", None) final_prompts = [] for line in lines: prompt_dict = json.loads(line) assert 'prompt' in prompt_dict or 'text' in prompt_dict if 'prompt' not in prompt_dict: prompt_dict['prompt'] = prompt_dict['text'] - if cfg.inference.insert_image_token == 'left': + if insert_image_token == 'left': prompt_dict['prompt'] = '' + prompt_dict['prompt'] - elif cfg.inference.insert_image_token == 'right': + elif insert_image_token == 'right': prompt_dict['prompt'] = prompt_dict['prompt'] + '' if 'image' in prompt_dict: prompt_dict['image_path'] = prompt_dict['image'] diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml index 8a21fccd0874..d8740bb98eb2 100644 --- a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml +++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml @@ -79,7 +79,7 @@ model: openai_gelu: False bias_activation_fusion: False megatron_legacy: True - activation: quick-gelu + activation: approx-gelu @@ -144,7 +144,7 @@ model: fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. 
- activation: quick-gelu + activation: approx-gelu # Megatron O2-style half-precision megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters diff --git a/examples/multimodal/vision_language_foundation/nsfw/conf/megatron_nsfw_config.yaml b/examples/multimodal/vision_language_foundation/nsfw/conf/megatron_nsfw_config.yaml index 11dc65155cf5..be820e8d731d 100644 --- a/examples/multimodal/vision_language_foundation/nsfw/conf/megatron_nsfw_config.yaml +++ b/examples/multimodal/vision_language_foundation/nsfw/conf/megatron_nsfw_config.yaml @@ -117,7 +117,7 @@ model: openai_gelu: false bias_activation_fusion: false megatron_legacy: true - activation: quick-gelu + activation: approx-gelu text: precision: ${trainer.precision} @@ -171,7 +171,7 @@ model: fp8_amax_history_len: 1 fp8_amax_compute_algo: most_recent use_emha: false - activation: quick-gelu + activation: approx-gelu # Megatron O2-style half-precision megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 4dd6b120c8c8..90f862869369 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -397,7 +397,7 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: if 'label' not in turn: turn[ 'label' - ] = "quality:6,toxicity:0,humor:0,creativity:0,violence:0,helpfulness:6,not_appropriate:0" + ] = "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4" value = DEFAULT_LABELS_TOKEN + turn['label'] + '\n' + turn['value'] conv.append_message(turn['from'], value) if not turn["value"]: diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 5fd0fa830dd0..3f4156d0fa73 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -612,8 +612,8 @@ def forward(self, tokens, text_position_ids, attention_mask, labels, media=None) output_tensor = self.model(**forward_args) return output_tensor - def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): - return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only) + def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step=None): + return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only, first_val_step) def training_step(self, dataloader_iter, batch_idx): """ @@ -623,7 +623,7 @@ def training_step(self, dataloader_iter, batch_idx): """ return MegatronGPTModel.training_step(self, dataloader_iter, batch_idx) - def get_forward_output_and_loss_func(self, validation_step=False): + def get_forward_output_and_loss_func(self, validation_step=False, tuning=False): def loss_func(output_tensor, loss_mask): loss_for_ub = self.loss_func(loss_mask, output_tensor) if validation_step and not self.cfg.data.get('validation_drop_last', True): @@ -921,7 +921,7 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: Returns: List of available pre-trained models. 
""" - return [] + return None def setup_test_data(self, cfg): pass diff --git a/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py b/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py index 901745f09421..9bb490fb8fc8 100644 --- a/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py +++ b/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py @@ -40,7 +40,9 @@ class LatentDiffusionEdit(LatentDiffusion): - def init_from_ckpt(self, path, ignore_keys=list(), only_model=False): + def init_from_ckpt( + self, path, ignore_keys=list(), only_model=False, load_vae=True, load_unet=True, load_encoder=True, + ): pl_sd = torch.load(path, map_location="cpu") if "state_dict" in list(pl_sd.keys()): pl_sd = pl_sd["state_dict"] diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 31b56443846f..36dfb74fbfaf 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -557,9 +557,9 @@ def __init__(self, cfg, model_parallel_config): self.restarted_from_ckpt = False if ckpt_path is not None: - load_vae = True if cfg.load_vae is None else cfg.load_vae - load_unet = True if cfg.load_unet is None else cfg.load_unet - load_encoder = True if cfg.load_encoder is None else cfg.load_encoder + load_vae = True if cfg.get("load_vae", None) is None else cfg.load_vae + load_unet = True if cfg.get("load_unet", None) is None else cfg.load_unet + load_encoder = True if cfg.get("load_encoder", None) is None else cfg.load_encoder self.init_from_ckpt( ckpt_path, ignore_keys, load_vae=load_vae, load_unet=load_unet, load_encoder=load_encoder, From 9231b83575282fdd4be71c65bc521607976960ea Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 16 Feb 2024 10:16:40 -0700 Subject: [PATCH 17/28] mcore ds fix (#8283) (#8385) * [tutorial] fixed missing RIR scripts file. 
(#8257) * add values to en tts dict (#7879) * mcore ds fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore * revert asr files * add comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset * update mcore version * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg * update mcore commit * fix Bert unit tests * update bert tests * fix bert mcore test * fix gpt jenkins tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update apex & TE commits * revert apex installation * turn off the fusion for jenkins --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay Co-authored-by: Eric Harper Signed-off-by: Michal Futrega --- Dockerfile | 8 ++++---- Jenkinsfile | 16 +++++++++++----- .../conf/megatron_gpt_config.yaml | 3 ++- .../tokenizers/huggingface/auto_tokenizer.py | 9 +++++++++ .../language_modeling/megatron_gpt_model.py | 9 ++++++--- nemo/core/config/hydra_runner.py | 3 +++ 6 files changed, 35 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6a5c48bee4c4..ec3e5dd87382 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.12-py3 +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3 # build an image that includes only the nemo dependencies, ensures that dependencies # are included first for optimal caching, and useful for building a development @@ -66,19 +66,19 @@ WORKDIR /workspace/ # We leave it here in case we need to work off of a specific commit in main RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 27cbe46714a50c43ed290f1b1472db8d2780c55c && \ + git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \ pip install . # Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771 RUN git clone https://github.com/NVIDIA/apex.git && \ cd apex && \ - git checkout b496d85fb88a801d8e680872a12822de310951fd && \ + git checkout f058162b215791b15507bb542f22ccfde49c872d && \ pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./ # Transformer Engine 1.2.0 RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \ cd TransformerEngine && \ - git fetch origin 4f9662fbe621671f5f905e772fc1138953af77f6 && \ + git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \ git checkout FETCH_HEAD && \ git submodule init && git submodule update && \ NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . 
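Besides the dependency bumps in the Dockerfile above, this patch ties cpu_offloading_num_layers to num_layers through a new "sum" resolver registered in hydra_runner.py (both hunks appear further below). A small sketch of how that interpolation resolves, using a toy config in place of the full GPT config:

    from omegaconf import OmegaConf

    # Same resolver the patch registers in nemo/core/config/hydra_runner.py.
    OmegaConf.register_new_resolver("sum", lambda x, y: x + y, replace=True)

    # Toy stand-in for the model section of megatron_gpt_config.yaml.
    cfg = OmegaConf.create({
        "model": {
            "num_layers": 12,
            # offload all but the final layer, i.e. num_layers - 1
            "cpu_offloading_num_layers": "${sum:${.num_layers},-1}",
        }
    })
    assert cfg.model.cpu_offloading_num_layers == 11
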
diff --git a/Jenkinsfile b/Jenkinsfile index c2357e280afb..957b69e13c17 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,10 +1,16 @@ pipeline { agent { docker { - image 'nvcr.io/nvidia/pytorch:23.12-py3' + image 'nvcr.io/nvidia/pytorch:24.01-py3' args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1' } } + + environment { + NVTE_FUSED_ATTN = 0 + NVTE_FLASH_ATTN = 0 + } + options { timeout(time: 8, unit: 'HOURS') disableConcurrentBuilds(abortPrevious: true) @@ -62,7 +68,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \ cd TransformerEngine && \ - git fetch origin 4f9662fbe621671f5f905e772fc1138953af77f6 && \ + git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \ git checkout FETCH_HEAD && \ git submodule init && git submodule update && \ NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .' @@ -85,7 +91,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout bed60a881f4b238b1c14b6c6a64997cc636e77b6 && \ + git checkout 240a8ef7a21df201e47b5b2ae33cc5f4c5486849 && \ pip install .' } } @@ -3227,7 +3233,7 @@ pipeline { } failFast true steps { - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -3257,7 +3263,7 @@ pipeline { model.activations_checkpoint_num_layers=1 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + sh "NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 004e8b584a13..aaa00df2e006 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -204,7 +204,7 @@ model: ##Offloading Activations/Weights to CPU cpu_offloading: False - cpu_offloading_num_layers: 11 #This value should be between [1,num_layers-1] as we don't want to offload the final layer's activations and expose any offloading duration for the final layer + cpu_offloading_num_layers: ${sum:${.num_layers},-1} #This value should be between [1,num_layers-1] as we don't want to offload the final layer's activations and expose any offloading duration for the final layer cpu_offloading_activations: True cpu_offloading_weights: True @@ -247,6 +247,7 @@ model: pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. 
Sample index will still be shuffled exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem + mock_dataset: False # Set to True and data_prefix to None to use artificially generated mock dataset # Nsys profiling options nsys_profile: diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py index e6e5840d93b9..4ed5dc07dbff 100644 --- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections import OrderedDict from typing import Optional from transformers import AutoTokenizer as AUTOTOKENIZER @@ -121,6 +122,9 @@ def __init__( if token is not None and token not in self.tokenizer.get_vocab(): new_tokens_in_vocab.append(token) + # value is required for megatron-core + self.unique_identifiers = OrderedDict() + if len(new_tokens_in_vocab) > 0: """ Special tokens that were not previously included in the tokenizer's vocabulary file will be added to @@ -227,6 +231,11 @@ def bos_id(self): def eos_id(self): return self.tokens_to_ids([getattr(self, 'eos_token')])[0] + @property + def eod(self): + """Returns EOS token id. Exact copy of the eos_id function. Required for megatron-core.""" + return self.tokens_to_ids([getattr(self, 'eos_token')])[0] + @property def sep_id(self): return self.tokens_to_ids([getattr(self, 'sep_token')])[0] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 2770090a7c1e..752696ac8faa 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -85,7 +85,7 @@ try: from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder - from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig + from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.pipeline_parallel.schedules import get_forward_backward_func @@ -1199,16 +1199,18 @@ def build_train_valid_test_datasets(self): 1 ] = 1 # This is to make sure we only have one epoch on every validation iteration + mock_dataset = self.cfg.data.get("mock_dataset", False) kwargs = { "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, "sequence_length": self.cfg.data.seq_length, "split": self.cfg.data.splits_string, "path_to_cache": self.cfg.data.index_mapping_dir, + "tokenizer": self.tokenizer, "reset_position_ids": self.reset_position_ids, "reset_attention_mask": self.reset_attention_mask, "eod_mask_loss": self.eod_mask_loss, - "eod_id": self.tokenizer.eos_id, + "mock": mock_dataset, } if isinstance(self.cfg.data.data_prefix, DictConfig): @@ -1225,9 +1227,10 @@ def build_train_valid_test_datasets(self): ).build() else: dataset_config = GPTDatasetConfig(**kwargs) + dataset_type = MockGPTDataset if mock_dataset else GPTDataset self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( - GPTDataset, 
train_valid_test_num_samples, dataset_config, + dataset_type, train_valid_test_num_samples, dataset_config, ).build() if self._train_ds is not None: diff --git a/nemo/core/config/hydra_runner.py b/nemo/core/config/hydra_runner.py index 604d2134f66b..c3c5486d7408 100644 --- a/nemo/core/config/hydra_runner.py +++ b/nemo/core/config/hydra_runner.py @@ -47,6 +47,9 @@ def _get_gpu_name(): # multiple interpolated values in the config OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True) +# sum interpolated values in the config +OmegaConf.register_new_resolver("sum", lambda x, y: x + y, replace=True) + def hydra_runner( config_path: Optional[str] = ".", config_name: Optional[str] = None, schema: Optional[Any] = None From 1e119a980a2ab026a6c1d2cc36e4b7528fbe76a1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 16 Feb 2024 10:19:42 -0700 Subject: [PATCH 18/28] MCore dataset compatibility for tokenizers (#8390) (#8397) * Add unique_identifiers for all tokenizers and eod for SentencePieceTokenizer * Add generalized token aliases to TokenizerSpec to conform with MegatronTokenizer's interface. Remove now-redundant individual fixes from AutoTokenizer and SentencePieceTokenizer. --------- Signed-off-by: Valerie Sarge Co-authored-by: Valerie Sarge Co-authored-by: Pablo Garay Co-authored-by: Eric Harper Signed-off-by: Michal Futrega --- .../common/tokenizers/tokenizer_spec.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/nemo/collections/common/tokenizers/tokenizer_spec.py b/nemo/collections/common/tokenizers/tokenizer_spec.py index 252571d76ef2..f6e905d75c3b 100644 --- a/nemo/collections/common/tokenizers/tokenizer_spec.py +++ b/nemo/collections/common/tokenizers/tokenizer_spec.py @@ -13,6 +13,7 @@ # limitations under the License. from abc import ABC, abstractmethod +from collections import OrderedDict from typing import List __all__ = ['TokenizerSpec'] @@ -53,3 +54,60 @@ def add_special_tokens(self, special_tokens: List[str]): @property def name(self): return type(self).__name__ + + @property + def unique_identifiers(self): + """Property required for use with megatron-core datasets.""" + return OrderedDict({"class": f"{type(self).__module__}.{type(self).__qualname__}"}) + + @property + def cls(self): + """Property alias to match MegatronTokenizer; returns cls_id if available.""" + if hasattr(self, 'cls_id'): + return self.cls_id + raise AttributeError(f"{type(self).__name__} has no attribute 'cls' or 'cls_id'") + + @property + def sep(self): + """Property alias to match MegatronTokenizer; returns sep_id if available.""" + if hasattr(self, 'sep_id'): + return self.sep_id + raise AttributeError(f"{type(self).__name__} has no attribute 'sep' or 'sep_id'") + + @property + def pad(self): + """Property alias to match MegatronTokenizer; returns pad_id if available.""" + if hasattr(self, 'pad_id'): + return self.pad_id + raise AttributeError(f"{type(self).__name__} has no attribute 'pad' or 'pad_id'") + + @property + def eod(self): + """Property alias to match MegatronTokenizer; returns eod_id if available.""" + if hasattr(self, 'eod_id'): + return self.eod_id + if hasattr(self, 'eos_id'): + # Default to end-of-sentence id if end-of-document is not defined. 
+ return self.eos_id + raise AttributeError(f"{type(self).__name__} has no attribute 'eod', 'eod_id', 'eos', or 'eos_id'") + + @property + def bos(self): + """Property alias to match MegatronTokenizer; returns bos_id if available.""" + if hasattr(self, 'bos_id'): + return self.bos_id + raise AttributeError(f"{type(self).__name__} has no attribute 'bos' or 'bos_id'") + + @property + def eos(self): + """Property alias to match MegatronTokenizer; returns eos_id if available.""" + if hasattr(self, 'eos_id'): + return self.eos_id + raise AttributeError(f"{type(self).__name__} has no attribute 'eos' or 'eos_id'") + + @property + def mask(self): + """Property alias to match MegatronTokenizer; returns mask_id if available.""" + if hasattr(self, 'mask_id'): + return self.mask_id + raise AttributeError(f"{type(self).__name__} has no attribute 'mask' or 'mask_id'") From ef17d91a96f3db89e348b1eb7145a45c95205e31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Fri, 16 Feb 2024 14:50:54 -0500 Subject: [PATCH 19/28] Canary: inference tokenization improvements; preserving custom keys when creating tarred manifests (#8432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Improvements for Canary: - carry over custom keys when creatin tarred manifests - selectable text field in ASR eval - get rid of prompt slicing, create proper inference prompts Signed-off-by: Piotr Żelasko * set ensure_ascii=False in tarred conversion to avoid breaking tokenizers trained on UTF-8 encoding Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: Michal Futrega --- examples/asr/speech_to_text_eval.py | 16 +++---- .../asr/data/audio_to_text_lhotse_prompted.py | 48 +++++++++++++------ .../asr/models/aed_multitask_models.py | 33 +++++-------- .../common/data/lhotse/nemo_adapters.py | 4 +- .../convert_to_tarred_audio_dataset.py | 26 +++------- .../common/test_lhotse_dataloading.py | 13 +++-- 6 files changed, 72 insertions(+), 68 deletions(-) diff --git a/examples/asr/speech_to_text_eval.py b/examples/asr/speech_to_text_eval.py index 9e24f0172208..7b59ffe3fbfc 100644 --- a/examples/asr/speech_to_text_eval.py +++ b/examples/asr/speech_to_text_eval.py @@ -25,13 +25,13 @@ for full list of arguments >> dataset_manifest: Required - path to dataset JSON manifest file (in NeMo format) - output_filename: Optional - output filename where the transcriptions will be written. (if scores_per_sample=True, + output_filename: Optional - output filename where the transcriptions will be written. (if scores_per_sample=True, metrics per sample will be written there too) use_cer: Bool, whether to compute CER or WER - use_punct_er: Bool, compute dataset Punctuation Error Rate (set the punctuation marks for metrics computation with + use_punct_er: Bool, compute dataset Punctuation Error Rate (set the punctuation marks for metrics computation with "text_processing.punctuation_marks") - + tolerance: Float, minimum WER/CER required to pass some arbitrary tolerance. only_score_manifest: Bool, when set will skip audio transcription and just calculate WER of provided manifest. 
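The speech_to_text_eval.py hunk below makes the reference field configurable through gt_text_attr_name, so manifests that store ground truth under a key other than "text" can be scored. A toy illustration of the lookup, with a hypothetical manifest entry:

    import json

    # Hypothetical manifest line whose reference lives under "answer" instead of "text".
    line = '{"audio_filepath": "utt1.wav", "answer": "hello world", "pred_text": "hello world"}'
    data = json.loads(line)

    gt_text_attr_name = "answer"  # would come from cfg.gt_text_attr_name
    assert data[gt_text_attr_name] == data["pred_text"]
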
@@ -141,13 +141,13 @@ def main(cfg: EvaluationConfig): for line in f: data = json.loads(line) - if 'pred_text' not in data: + if "pred_text" not in data: invalid_manifest = True break - ground_truth_text.append(data['text']) + ground_truth_text.append(data[cfg.gt_text_attr_name]) - predicted_text.append(data['pred_text']) + predicted_text.append(data["pred_text"]) pc = PunctuationCapitalization(cfg.text_processing.punctuation_marks) if cfg.text_processing.separate_punctuation: @@ -183,7 +183,7 @@ def main(cfg: EvaluationConfig): samples_with_metrics = compute_metrics_per_sample( manifest_path=cfg.dataset_manifest, - reference_field="text", + reference_field=cfg.gt_text_attr_name, hypothesis_field="pred_text", metrics=metrics_to_compute, punctuation_marks=cfg.text_processing.punctuation_marks, @@ -207,7 +207,7 @@ def main(cfg: EvaluationConfig): logging.info(f'Got {metric_name} of {metric_value}. Tolerance was {cfg.tolerance}') - logging.info(f'Dataset WER/CER ' + str(round(100 * wer, 2)) + "%/" + str(round(100 * cer, 2)) + "%") + logging.info(f"Dataset WER/CER {wer:.2%}/{cer:.2%}") if cfg.use_punct_er: dper_obj.print() diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py index 7112224d97ee..834711d937bd 100644 --- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py +++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py @@ -41,18 +41,22 @@ class PromptedAudioToTextLhotseDataset(torch.utils.data.Dataset): """ def __init__( - self, tokenizer: TokenizerSpec, prompt_format_fn: Callable[[CutSet, TokenizerWrapper], Sequence[Sequence[int]]] + self, + tokenizer: TokenizerSpec, + prompt_format_fn: Callable[[CutSet, TokenizerWrapper, bool], Sequence[Sequence[int]]], + inference: bool = False, ): super().__init__() self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) self.padding_value = self.tokenizer._tokenizer.pad_id self.prompt_format_fn = prompt_format_fn + self.inference = inference def __getitem__(self, cuts: CutSet) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: audio, audio_lens, cuts = self.load_audio(cuts) - tokens = self.prompt_format_fn(cuts, self.tokenizer) + tokens = self.prompt_format_fn(cuts, self.tokenizer, self.inference) tokens = [torch.as_tensor(t) for t in tokens] token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) tokens = collate_vectors(tokens, padding_value=self.padding_value) @@ -64,7 +68,7 @@ def __getitem__(self, cuts: CutSet) -> tuple[torch.Tensor, torch.Tensor, torch.T PROMPT_FORMAT_FNS = {} -def registered_prompt_format_fn(prompt_fn: Callable[[CutSet, TokenizerWrapper], Sequence[Sequence[int]]]): +def registered_prompt_format_fn(prompt_fn: Callable[[CutSet, TokenizerWrapper, bool], Sequence[Sequence[int]]]): """ Decorator for registering prompt functions under a name. 
@@ -82,7 +86,7 @@ def registered_prompt_format_fn(prompt_fn: Callable[[CutSet, TokenizerWrapper], return prompt_fn -def get_prompt_format_fn(name: str) -> Callable[[CutSet, TokenizerWrapper], Sequence[Sequence[int]]]: +def get_prompt_format_fn(name: str) -> Callable[[CutSet, TokenizerWrapper, bool], Sequence[Sequence[int]]]: if name not in PROMPT_FORMAT_FNS: raise ValueError( f"Unknown prompt format function name: {name} " f"(must be one of: {list(PROMPT_FORMAT_FNS.keys())}" @@ -91,7 +95,7 @@ def get_prompt_format_fn(name: str) -> Callable[[CutSet, TokenizerWrapper], Sequ @registered_prompt_format_fn -def canary(cuts: CutSet, tokenizer: TokenizerWrapper) -> Sequence[Sequence[int]]: +def canary(cuts: CutSet, tokenizer: TokenizerWrapper, inference: bool = False) -> Sequence[Sequence[int]]: """ Prepend and append control tokens to the token sequence as per Canary format. @@ -135,8 +139,11 @@ def canary(cuts: CutSet, tokenizer: TokenizerWrapper) -> Sequence[Sequence[int]] ) # Actual tokenization. If a cut has multiple supervisions, we'll stitch their tokenized texts together. - texts = [sup.text for sup in cut.supervisions] - langs = [sup.language for sup in cut.supervisions] + if not inference: + texts = [sup.text for sup in cut.supervisions] + langs = [sup.language for sup in cut.supervisions] + else: + texts, langs = None, None taskname = cut.custom['taskname'] pnc = cut.custom['pnc'] source_lang = cut.custom['source_lang'] @@ -149,18 +156,29 @@ def canary(cuts: CutSet, tokenizer: TokenizerWrapper) -> Sequence[Sequence[int]] return canary_tokens -def canary_prompt(tokenizer: CanaryTokenizer, text, language, source_language, target_language, taskname, pnc): +def canary_prompt( + tokenizer: CanaryTokenizer, + text: str | list[str] | None, + language: str | list[str] | None, + source_language: str, + target_language: str, + taskname: str, + pnc: str, +) -> list[int]: if isinstance(text, str): text = [text] if isinstance(language, str): language = [language] - tokens = sum((tokenizer.text_to_ids(text_, lang_) for text_, lang_ in zip(text, language)), start=[]) + if text is not None: + tokens = sum((tokenizer.text_to_ids(text_, lang_) for text_, lang_ in zip(text, language)), start=[]) + else: + tokens = None # create prompt for inference # bos prompted_tokens = [tokenizer.bos_id] - if len(tokens) == 0: + if tokens is not None and len(tokens) == 0: # no speech token prompted_tokens.append(tokenizer.nospeech_id) else: @@ -201,9 +219,11 @@ def canary_prompt(tokenizer: CanaryTokenizer, text, language, source_language, t else: raise ValueError(f"Unknown value for key 'pnc': {pnc}") - # text - prompted_tokens.extend(tokens) + # text (only in training) + if tokens is not None: + prompted_tokens.extend(tokens) - # eos - prompted_tokens.append(tokenizer.eos_id) + # eos (only in training) + if tokens is not None: + prompted_tokens.append(tokenizer.eos_id) return prompted_tokens diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index 4fb88b208076..d0cd40339b42 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -202,8 +202,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): tokenizer=self.tokenizer, ) - self.context_len_for_AR_decoding = self.cfg.get("context_len_for_AR_decoding", 5) - # Define autoregressive CE loss with open_dict(self.cfg.loss): self.cfg.loss.pad_id = self.tokenizer.pad_id @@ -442,7 +440,7 @@ def transcribe( return 
super().transcribe(audio=audio, override_config=trcfg) - def _setup_dataloader_from_config(self, config: Optional[Dict]): + def _setup_dataloader_from_config(self, config: Optional[Dict], inference: bool = False): assert config.get("use_lhotse", False), ( "Multi-task model only supports dataloading with Lhotse. " "Please set config.{train,validation,test}_ds.use_lhotse=True" @@ -452,7 +450,9 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): global_rank=self.global_rank, world_size=self.world_size, dataset=PromptedAudioToTextLhotseDataset( - tokenizer=self.tokenizer, prompt_format_fn=get_prompt_format_fn(self.prompt_format), + tokenizer=self.tokenizer, + prompt_format_fn=get_prompt_format_fn(self.prompt_format), + inference=inference, ), ) @@ -496,7 +496,7 @@ def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict # preserve config self._update_dataset_config(dataset_name='validation', config=val_data_config) - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) + self._validation_dl = self._setup_dataloader_from_config(config=val_data_config, inference=True) def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): """ @@ -512,7 +512,7 @@ def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): # preserve config self._update_dataset_config(dataset_name='test', config=test_data_config) - self._test_dl = self._setup_dataloader_from_config(config=test_data_config) + self._test_dl = self._setup_dataloader_from_config(config=test_data_config, inference=True) @property def input_types(self) -> Optional[Dict[str, NeuralType]]: @@ -660,9 +660,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): beam_hypotheses = self.decoding.decode_predictions_tensor( encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, - decoder_input_ids=input_ids[:, : self.context_len_for_AR_decoding] - if self.context_len_for_AR_decoding > 0 - else None, + decoder_input_ids=input_ids, return_hypotheses=False, )[0] @@ -839,14 +837,7 @@ def _transcribe_forward(self, batch: Any, trcfg: MultiTaskTranscriptionConfig): log_probs, encoded_len, enc_states, enc_mask = self.forward( input_signal=batch[0], input_signal_length=batch[1] ) - - decoder_input_ids = ( - batch[2][:, : self.context_len_for_AR_decoding].to(trcfg._internal.device) - if self.context_len_for_AR_decoding > 0 - else None - ) - # decoder_input_ids = None - + decoder_input_ids = batch[2].to(trcfg._internal.device) output = dict( log_probs=log_probs, encoded_lengths=encoded_len, @@ -881,7 +872,7 @@ def _transcribe_output_processing(self, outputs, trcfg: MultiTaskTranscriptionCo best_hypotheses, all_hypotheses = self.decoding.decode_predictions_tensor( encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, - decoder_input_ids=decoder_input_ids if self.context_len_for_AR_decoding > 0 else None, + decoder_input_ids=decoder_input_ids, return_hypotheses=trcfg.return_hypotheses, ) @@ -933,7 +924,7 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLo 'lang_field': 'target_lang', } - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) + temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config), inference=True) return temporary_datalayer def _transcribe_on_end(self, trcfg: MultiTaskTranscriptionConfig): @@ -1022,9 +1013,7 @@ def predict_step(self, batch, batch_idx=0, dataloader_idx=0, has_processed_signa text = 
self.decoding.decode_predictions_tensor( encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, - decoder_input_ids=transcript[:, : self.context_len_for_AR_decoding] - if self.context_len_for_AR_decoding > 0 - else None, + decoder_input_ids=transcript, return_hypotheses=False, )[0] diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index 8d1a007cd915..036dfe5eb15b 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -90,7 +90,7 @@ def __iter__(self) -> Generator[Cut, None, None]: recording_id=cut.recording_id, start=0, duration=cut.duration, - text=data[self.text_field], + text=data.get(self.text_field), language=data.get(self.lang_field), ) ) @@ -257,7 +257,7 @@ def __iter__(self) -> Generator[Cut, None, None]: recording_id=cut.recording_id, start=0, duration=cut.duration, - text=data[self.text_field], + text=data.get(self.text_field), language=data.get(self.lang_field), ) ) diff --git a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py index c701403580ea..690010ad29ca 100644 --- a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py +++ b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py @@ -364,7 +364,7 @@ def create_new_dataset(self, manifest_path: str, target_dir: str = "./tarred/", new_manifest_shard_path = os.path.join(sharded_manifests_dir, f'manifest_{shard_id}.json') with open(new_manifest_shard_path, 'w', encoding='utf-8') as m2: for entry in manifest: - json.dump(entry, m2) + json.dump(entry, m2, ensure_ascii=False) m2.write('\n') # Flatten the list of list of entries to a list of entries @@ -377,7 +377,7 @@ def create_new_dataset(self, manifest_path: str, target_dir: str = "./tarred/", new_manifest_path = os.path.join(target_dir, 'tarred_audio_manifest.json') with open(new_manifest_path, 'w', encoding='utf-8') as m2: for entry in new_entries: - json.dump(entry, m2) + json.dump(entry, m2, ensure_ascii=False) m2.write('\n') # Write metadata (default metadata for new datasets) @@ -555,7 +555,7 @@ def create_concatenated_dataset( new_manifest_shard_path = os.path.join(sharded_manifests_dir, f'manifest_{shard_id}.json') with open(new_manifest_shard_path, 'w', encoding='utf-8') as m2: for entry in manifest: - json.dump(entry, m2) + json.dump(entry, m2, ensure_ascii=False) m2.write('\n') # Flatten the list of list of entries to a list of entries @@ -574,12 +574,12 @@ def create_concatenated_dataset( with open(new_manifest_path, 'w', encoding='utf-8') as m2: # First write all the entries of base manifest for entry in base_entries: - json.dump(entry, m2) + json.dump(entry, m2, ensure_ascii=False) m2.write('\n') # Finally write the new entries for entry in new_entries: - json.dump(entry, m2) + json.dump(entry, m2, ensure_ascii=False) m2.write('\n') # Preserve historical metadata @@ -679,24 +679,12 @@ def _create_shard(self, entries, target_dir, shard_id, manifest_folder): to_write = base + "-sub" + str(count[squashed_filename]) + ext count[squashed_filename] += 1 + # Carry over every key in the entry, override audio_filepath and shard_id new_entry = { + **entry, 'audio_filepath': to_write, - 'duration': entry['duration'], 'shard_id': shard_id, # Keep shard ID for recordkeeping } - - if 'label' in entry: - new_entry['label'] = entry['label'] - - if 'text' in entry: - new_entry['text'] = entry['text'] - - if 'offset' in entry: - 
new_entry['offset'] = entry['offset'] - - if 'lang' in entry: - new_entry['lang'] = entry['lang'] - new_entries.append(new_entry) tar.close() diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index 4d97db537b09..5d177b39c5eb 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -246,8 +246,9 @@ def test_dataloader_from_nemo_manifest(nemo_manifest_path: Path): "batch_duration": 4.0, # seconds "quadratic_duration": 15.0, # seconds "shuffle_buffer_size": 10, - "buffer_size": 100, + "bucket_buffer_size": 100, "seed": 0, + "shard_seed": 0, } ) @@ -288,6 +289,7 @@ def test_dataloader_from_nemo_manifest_has_custom_fields(nemo_manifest_path: Pat "batch_duration": 4.0, # seconds "shuffle_buffer_size": 10, "seed": 0, + "shard_seed": 0, } ) @@ -322,6 +324,7 @@ def test_dataloader_from_tarred_nemo_manifest(nemo_tarred_manifest_path: tuple[s "shuffle_buffer_size": 10, "bucket_buffer_size": 100, "seed": 0, + "shard_seed": 0, } ) @@ -368,6 +371,7 @@ def test_dataloader_from_tarred_nemo_manifest_weighted_combination(nemo_tarred_m "shuffle_buffer_size": 10, "bucket_buffer_size": 100, "seed": 0, + "shard_seed": 0, } ) @@ -399,6 +403,7 @@ def test_dataloader_from_tarred_nemo_manifest_multi(nemo_tarred_manifest_path_mu "shuffle_buffer_size": 10, "bucket_buffer_size": 100, "seed": 0, + "shard_seed": 0, } ) @@ -446,6 +451,7 @@ def test_dataloader_from_tarred_nemo_manifest_multi_max_open_streams(nemo_tarred "shuffle_buffer_size": 10, "bucket_buffer_size": 100, "seed": 0, + "shard_seed": 0, } ) @@ -475,6 +481,7 @@ def test_dataloader_from_tarred_nemo_manifest_concat(nemo_tarred_manifest_path: "drop_last": False, "shuffle_buffer_size": 10, "seed": 0, + "shard_seed": 0, } ) @@ -534,7 +541,7 @@ def test_dataloader_from_lhotse_shar_cuts_combine_datasets_unweighted( "shuffle_buffer_size": 10, "bucket_buffer_size": 100, "seed": 0, - "shar_seed": 0, + "shard_seed": 0, } ) @@ -587,7 +594,7 @@ def test_dataloader_from_lhotse_shar_cuts_combine_datasets_weighted( "shuffle_buffer_size": 10, "bucket_buffer_size": 100, "seed": 0, - "shar_seed": 0, + "shard_seed": 0, } ) From 1e1cf63a9ea0651c776f84c19e2848e11fe2d042 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Fri, 16 Feb 2024 18:58:48 -0600 Subject: [PATCH 20/28] add sbert to IR (#8445) * add sbert to IR Signed-off-by: ataghibakhsh * add doc Signed-off-by: ataghibakhsh * fix the auto_tokenizer property method reset bug Signed-off-by: ataghibakhsh * addressed bot comments Signed-off-by: ataghibakhsh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: ataghibakhsh Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Michal Futrega --- docs/source/nlp/information_retrieval.rst | 96 ++- .../conf/megatron_sbert_config.yaml | 160 ++++ .../megatron_sbert_finetune.py | 59 ++ .../tokenizers/huggingface/auto_tokenizer.py | 3 - .../bert_embedding_dataset.py | 93 ++ .../megatron_sbert_model.py | 795 ++++++++++++++++++ 6 files changed, 1198 insertions(+), 8 deletions(-) create mode 100644 examples/nlp/information_retrieval/conf/megatron_sbert_config.yaml create mode 100644 examples/nlp/information_retrieval/megatron_sbert_finetune.py create mode 100644 nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py create mode 100644 
nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py

diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst
index 3c71ffcfcd12..5cf87143848c 100644
--- a/docs/source/nlp/information_retrieval.rst
+++ b/docs/source/nlp/information_retrieval.rst
@@ -1,10 +1,96 @@
 .. _information_retrieval:
 
-Information Retrieval
-=====================
+Sentence-BERT
+=============
 
-We recommend you try the Information Retrieval model in a Jupyter notebook (can run on `Google's Colab `_): `NeMo/tutorials/nlp/Information_Retrieval_MSMARCO.ipynb `__.
+Sentence-BERT (SBERT) is a modification of the BERT model that is specifically trained to generate semantically meaningful sentence embeddings.
+The model architecture and pre-training process are detailed in the `Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks `__ paper. Similar to BERT,
+Sentence-BERT utilizes a BERT-based architecture, but it is trained using a siamese and triplet network structure to derive fixed-sized sentence embeddings that capture semantic information.
+Sentence-BERT is commonly used to generate high-quality sentence embeddings for various downstream natural language processing tasks, such as semantic textual similarity, clustering, and information retrieval.
 
-Connect to an instance with a GPU (**Runtime** -> **Change runtime type** -> select **GPU** for hardware the accelerator),
+Data Input for the Sentence-BERT model
+--------------------------------------
 
-An example script on how to train the model can be found here: `NeMo/examples/nlp/information_retrieval `__.
+The fine-tuning data for the Sentence-BERT (SBERT) model should consist of data instances,
+each comprising a query, a positive document, and a list of negative documents. Negative mining is
+not supported in NeMo yet; therefore, data preprocessing should be performed offline before training.
+The dataset should be in JSON format. For instance, the dataset should have the following structure:
+
+.. code-block:: python
+
+    [
+        {
+            "question": "Query",
+            "pos_doc": ["Positive"],
+            "neg_doc": ["Negative_1", "Negative_2", ..., "Negative_n"]
+        },
+        {
+            // Next data instance
+        },
+        ...,
+        {
+            // Subsequent data instance
+        }
+    ]
+
+This format ensures that the fine-tuning data is appropriately structured for training the Sentence-BERT model.
+
+
+Fine-tuning the Sentence-BERT model
+-----------------------------------
+
+For fine-tuning the Sentence-BERT model, you need to initialize it with a BERT model
+checkpoint. To do so, you either need a ``.nemo`` checkpoint or need to convert a HuggingFace
+BERT checkpoint to NeMo using the following:
+
+.. code-block:: python
+
+    python NeMo/scripts/nlp_language_modeling/convert_bert_hf_to_nemo.py \
+        --input_name_or_path "intfloat/e5-large-unsupervised" \
+        --output_path /path/to/output/nemo/file.nemo
+
+Then you can fine-tune the Sentence-BERT model using the following script:
+
+.. 
code-block:: python + + + #!/bin/bash + + PROJECT= # wandb project name + NAME= # wandb run name + export WANDB_API_KEY= # your_wandb_key + + + NUM_DEVICES=1 # number of gpus to train on + + + CONFIG_PATH="/NeMo/examples/nlp/information_retrieval/conf/" + CONFIG_NAME="megatron_bert_config" + PATH_TO_NEMO_MODEL= # Path to conveted nemo model from hf + DATASET_PATH= # Path to json dataset + SAVE_DIR= # where the checkpoint and logs are saved + mkdir -p $SAVE_DIR + + + python /NeMo/examples/nlp/language_modeling/megatron_sbert_pretraining.py \ + --config-path=${CONFIG_PATH} \ + --config-name=${CONFIG_NAME} \ + restore_from_path=${PATH_TO_NEMO_MODEL} \ + trainer.devices=${NUM_DEVICES} \ + trainer.val_check_interval=100 \ + trainer.max_epochs=1 \ + +trainer.num_sanity_val_steps=0 \ + model.global_batch_size=8 \ # should be NUM_DEVICES * model.micro_batch_size + model.micro_batch_size=8 \ + model.tokenizer.library="huggingface" \ + model.tokenizer.type="intfloat/e5-large-unsupervised" \ + ++model.data.data_prefix=${DATASET_PATH} \ + ++model.tokenizer.do_lower_case=False \ + ++model.data.evaluation_sample_size=100 \ + ++model.data.hard_negatives_to_train=4 \ + ++model.data.evaluation_steps=100 \ + exp_manager.explicit_log_dir=${SAVE_DIR} \ + exp_manager.create_wandb_logger=True \ + exp_manager.resume_if_exists=True \ + exp_manager.wandb_logger_kwargs.name=${NAME} \ + exp_manager.wandb_logger_kwargs.project=${PROJECT} diff --git a/examples/nlp/information_retrieval/conf/megatron_sbert_config.yaml b/examples/nlp/information_retrieval/conf/megatron_sbert_config.yaml new file mode 100644 index 000000000000..c58d120dad0c --- /dev/null +++ b/examples/nlp/information_retrieval/conf/megatron_sbert_config.yaml @@ -0,0 +1,160 @@ +name: megatron_bert +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_bert + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'megatron_bert--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + + +model: + # model parallelism + mcore_bert: False + micro_batch_size: 4 + global_batch_size: 8 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + + # model architecture + encoder_seq_length: 512 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. 
+ num_layers: 24 + hidden_size: 1024 + ffn_hidden_size: 4096 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 16 + skip_head: True + transformer_block_type: post_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. + normalization: layernorm + layernorm_epsilon: 1e-12 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + bert_binary_head: True # BERT binary head + megatron_legacy: True + + tokenizer: + library: 'huggingface' + type: 'intfloat/e5-large-unsupervised' + model: null + vocab_file: null + merge_file: null + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: False + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. 
The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + data: + # Path to data must be specified by the user. + # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + data_prefix: [1.0, /path/to/data] + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: 900,50,50 + seq_length: ${model.encoder_seq_length} + skip_warmup: True + num_workers: 0 + dataloader_type: single # cyclic, LDDL + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + masked_lm_prob: 0.15 # Probability of replacing a token with mask. + short_seq_prob: 0.1 # Probability of producing a short sequence. + + optim: + name: fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/information_retrieval/megatron_sbert_finetune.py b/examples/nlp/information_retrieval/megatron_sbert_finetune.py new file mode 100644 index 000000000000..050db34510e5 --- /dev/null +++ b/examples/nlp/information_retrieval/megatron_sbert_finetune.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import torch.multiprocessing as mp
+from omegaconf.omegaconf import OmegaConf, open_dict
+
+from nemo.collections.nlp.models.information_retrieval.megatron_sbert_model import MegatronSBertModel
+from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronBertTrainerBuilder
+from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+from nemo.utils.exp_manager import exp_manager
+
+
+@hydra_runner(config_path="conf", config_name="megatron_bert_config")
+def main(cfg) -> None:
+    if cfg.model.data.dataloader_type != "LDDL":
+        mp.set_start_method("spawn", force=True)
+
+    logging.info("\n\n************** Experiment configuration ***********")
+    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
+
+    trainer = MegatronBertTrainerBuilder(cfg).create_trainer()
+    exp_manager(trainer, cfg.exp_manager)
+
+    model_cfg = MegatronSBertModel.merge_cfg_with(cfg.restore_from_path, cfg)
+
+    assert (
+        model_cfg.micro_batch_size * cfg.trainer.devices == model_cfg.global_batch_size
+    ), "Gradient accumulation is not supported for contrastive learning yet"
+
+    OmegaConf.set_struct(model_cfg, True)
+    with open_dict(model_cfg):
+        model_cfg.precision = trainer.precision
+
+    model = MegatronSBertModel.restore_from(
+        restore_path=cfg.restore_from_path,
+        trainer=trainer,
+        save_restore_connector=NLPSaveRestoreConnector(),
+        override_config_path=model_cfg,
+        strict=True,
+    )
+
+    trainer.fit(model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
index 4ed5dc07dbff..85f9af6e3df2 100644
--- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
+++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
@@ -122,9 +122,6 @@ def __init__(
             if token is not None and token not in self.tokenizer.get_vocab():
                 new_tokens_in_vocab.append(token)
 
-        # value is required for megatron-core
-        self.unique_identifiers = OrderedDict()
-
         if len(new_tokens_in_vocab) > 0:
             """
             Special tokens that were not previously included in the tokenizer's vocabulary file will be added to
diff --git a/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py b/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py
new file mode 100644
index 000000000000..038b1c47ec56
--- /dev/null
+++ b/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+from typing import Dict, List
+
+from torch.utils.data import Dataset
+
+
+class BertEmbeddingDataset(Dataset):
+    """SentenceTransformer tokenizer and MultipleNegativesRankingLoss expect
+    a single positive and a single hard-negative (optional) per example. 
+ This Dataset manages the case where there is more than one positive or negative + available, in form of a list. + It uses the list of positives/negatives as a queue, where for each epoch the + first positive/negative of the queue is used for training, after which the + item is moved to the end of the queue. + If num_hard_negs > 1, multiple negatives will be sampled for each example. + + Args: + data (List[Dict[str, str]]): A list of Dict whose + keys are "question", "pos_doc", "neg_doc" + num_hard_negs (int): Number of hard-negatives for each query to sample + shuffled_negs (bool, optional): Whether the negatives per example + needs to be shuffled in the initialization. Defaults to False. + """ + + def __init__( + self, + data: List[Dict[str, str]], + shuffled_negs: bool = False, + num_hard_negs: int = 1, + query_prefix: str = "", + passage_prefix: str = "", + ): + self.data = data + self.num_hard_negs = num_hard_negs + self.query_prefix = query_prefix + self.passage_prefix = passage_prefix + + if shuffled_negs: + for example in self.data: + random.shuffle(example["neg_doc"]) + + def __len__(self): + return len(self.data) + + def __getitem__(self, item): + + example = self.data[item] + question = f'{self.query_prefix} {example["question"]}'.strip() + texts = [question] + + positive = example["pos_doc"] + if isinstance(positive, list): + + positive = example["pos_doc"][0] + + positive = f"{self.passage_prefix} {positive}".strip() + texts.append(positive) + + negative = [] + if "neg_doc" in example: + negative = example["neg_doc"] + selected_negs = [] + if isinstance(negative, list): + for counter in range(self.num_hard_negs): + if len(example["neg_doc"]) > 0: + + negative = example["neg_doc"][counter] + selected_negs.append(negative) + else: + # Providing empty hard-negative, for this example, + # so that it matches the number of hard negatives + # of the other examples + selected_negs.append("") + + else: + selected_negs = [negative] + selected_negs = [f"{self.passage_prefix} {neg}".strip() for neg in selected_negs] + texts.extend(selected_negs) + return texts diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py new file mode 100644 index 000000000000..0d312845db58 --- /dev/null +++ b/nemo/collections/nlp/models/information_retrieval/megatron_sbert_model.py @@ -0,0 +1,795 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import logging +import os +import random +from typing import Dict, List, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from omegaconf import DictConfig, OmegaConf, open_dict +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.trainer.trainer import Trainer +from torch import Tensor, nn + +from nemo.collections.nlp.data.information_retrieval.bert_embedding_dataset import BertEmbeddingDataset +from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( + MegatronPretrainingRandomSampler, + MegatronPretrainingSampler, +) +from nemo.collections.nlp.models.language_modeling.megatron.bert_model import BertModel, bert_extended_attention_mask +from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel +from nemo.collections.nlp.modules.common.megatron.utils import ( + ApexGuardDefaults, + average_losses_across_data_parallel_group, + build_position_ids, +) +from nemo.utils import logging + +try: + from megatron.core import ModelParallelConfig, parallel_state + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + ModelParallelConfig = ApexGuardDefaults + + HAVE_MEGATRON_CORE = False + + +def set_seed(seed: int = 42) -> None: + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + # When running on the CuDNN backend, two further options must be set + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + # Set a fixed value for the hash seed + os.environ["PYTHONHASHSEED"] = str(seed) + print(f"Random seed set as {seed}") + + +########################## +# Below class is copied from SentenceTransformer library: https://github.com/UKPLab/sentence-transformers/blob/08a57b4a19ddaf7cccda51cd0c2c8af7bbc339a3/sentence_transformers/models/Normalize.py +########################## + + +class Normalize(nn.Module): + """ + This layer normalizes embeddings to unit length + """ + + def __init__(self): + super(Normalize, self).__init__() + + def forward(self, features: Dict[str, Tensor]): + features.update({"sentence_embedding": F.normalize(features["sentence_embedding"], p=2, dim=1)}) + return features + + +########################## +# Below class is copied from SentenceTransformer library: https://github.com/UKPLab/sentence-transformers/blob/08a57b4a19ddaf7cccda51cd0c2c8af7bbc339a3/sentence_transformers/models/Pooling.py +########################## + + +class Pooling(nn.Module): + """Performs pooling (max or mean) on the token embeddings. + + Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model. + You can concatenate multiple poolings together. + + :param word_embedding_dimension: Dimensions for the word embeddings + :param pooling_mode: Can be a string: mean/max/cls. If set, overwrites the other pooling_mode_* settings + :param pooling_mode_cls_token: Use the first token (CLS token) as text representations + :param pooling_mode_max_tokens: Use max in each dimension over all tokens. + :param pooling_mode_mean_tokens: Perform mean-pooling + :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but divide by sqrt(input_length). 
+ :param pooling_mode_weightedmean_tokens: Perform (position) weighted mean pooling, see https://arxiv.org/abs/2202.08904 + :param pooling_mode_lasttoken: Perform last token pooling, see https://arxiv.org/abs/2202.08904 & https://arxiv.org/abs/2201.10005 + """ + + def __init__( + self, + word_embedding_dimension: int, + pooling_mode: str = None, + pooling_mode_cls_token: bool = False, + pooling_mode_max_tokens: bool = False, + pooling_mode_mean_tokens: bool = True, + pooling_mode_mean_sqrt_len_tokens: bool = False, + pooling_mode_weightedmean_tokens: bool = False, + pooling_mode_lasttoken: bool = False, + ): + super(Pooling, self).__init__() + + self.config_keys = [ + "word_embedding_dimension", + "pooling_mode_cls_token", + "pooling_mode_mean_tokens", + "pooling_mode_max_tokens", + "pooling_mode_mean_sqrt_len_tokens", + "pooling_mode_weightedmean_tokens", + "pooling_mode_lasttoken", + ] + + if pooling_mode is not None: # Set pooling mode by string + pooling_mode = pooling_mode.lower() + assert pooling_mode in ["mean", "max", "cls", "weightedmean", "lasttoken"] + pooling_mode_cls_token = pooling_mode == "cls" + pooling_mode_max_tokens = pooling_mode == "max" + pooling_mode_mean_tokens = pooling_mode == "mean" + pooling_mode_weightedmean_tokens = pooling_mode == "weightedmean" + pooling_mode_lasttoken = pooling_mode == "lasttoken" + + self.word_embedding_dimension = word_embedding_dimension + self.pooling_mode_cls_token = pooling_mode_cls_token + self.pooling_mode_mean_tokens = pooling_mode_mean_tokens + self.pooling_mode_max_tokens = pooling_mode_max_tokens + self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens + self.pooling_mode_weightedmean_tokens = pooling_mode_weightedmean_tokens + self.pooling_mode_lasttoken = pooling_mode_lasttoken + + pooling_mode_multiplier = sum( + [ + pooling_mode_cls_token, + pooling_mode_max_tokens, + pooling_mode_mean_tokens, + pooling_mode_mean_sqrt_len_tokens, + pooling_mode_weightedmean_tokens, + pooling_mode_lasttoken, + ] + ) + self.pooling_output_dimension = pooling_mode_multiplier * word_embedding_dimension + + def __repr__(self): + return "Pooling({})".format(self.get_config_dict()) + + def forward(self, features: Dict[str, Tensor]): + token_embeddings = features["token_embeddings"] + attention_mask = features["attention_mask"] + + ## Pooling strategy + output_vectors = [] + if self.pooling_mode_cls_token: + cls_token = features.get("cls_token_embeddings", token_embeddings[:, 0]) # Take first token by default + output_vectors.append(cls_token) + if self.pooling_mode_max_tokens: + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value + max_over_time = torch.max(token_embeddings, 1)[0] + output_vectors.append(max_over_time) + if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + + # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present + if "token_weights_sum" in features: + sum_mask = features["token_weights_sum"].unsqueeze(-1).expand(sum_embeddings.size()) + else: + sum_mask = input_mask_expanded.sum(1) + + sum_mask = torch.clamp(sum_mask, min=1e-9) + + if self.pooling_mode_mean_tokens: + output_vectors.append(sum_embeddings / sum_mask) + if 
self.pooling_mode_mean_sqrt_len_tokens: + output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) + if self.pooling_mode_weightedmean_tokens: + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + # token_embeddings shape: bs, seq, hidden_dim + weights = ( + torch.arange(start=1, end=token_embeddings.shape[1] + 1) + .unsqueeze(0) + .unsqueeze(-1) + .expand(token_embeddings.size()) + .float() + .to(token_embeddings.device) + ) + assert weights.shape == token_embeddings.shape == input_mask_expanded.shape + input_mask_expanded = input_mask_expanded * weights + + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + + # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present + if "token_weights_sum" in features: + sum_mask = features["token_weights_sum"].unsqueeze(-1).expand(sum_embeddings.size()) + else: + sum_mask = input_mask_expanded.sum(1) + + sum_mask = torch.clamp(sum_mask, min=1e-9) + output_vectors.append(sum_embeddings / sum_mask) + if self.pooling_mode_lasttoken: + bs, seq_len, hidden_dim = token_embeddings.shape + # attention_mask shape: (bs, seq_len) + # Get shape [bs] indices of the last token (i.e. the last token for each batch item) + # argmin gives us the index of the first 0 in the attention mask; We get the last 1 index by subtracting 1 + # Any sequence where min == 1, we use the entire sequence length since argmin = 0 + values, indices = torch.min(attention_mask, 1, keepdim=False) + gather_indices = torch.where(values == 0, indices, seq_len) - 1 # Shape [bs] + + # There are empty sequences, where the index would become -1 which will crash + gather_indices = torch.clamp(gather_indices, min=0) + + # Turn indices from shape [bs] --> [bs, 1, hidden_dim] + gather_indices = gather_indices.unsqueeze(-1).repeat(1, hidden_dim) + gather_indices = gather_indices.unsqueeze(1) + assert gather_indices.shape == (bs, 1, hidden_dim) + + # Gather along the 1st dim (seq_len) (bs, seq_len, hidden_dim -> bs, hidden_dim) + # Actually no need for the attention mask as we gather the last token where attn_mask = 1 + # but as we set some indices (which shouldn't be attended to) to 0 with clamp, we + # use the attention mask to ignore them again + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + embedding = torch.gather(token_embeddings * input_mask_expanded, 1, gather_indices).squeeze(dim=1) + output_vectors.append(embedding) + + output_vector = torch.cat(output_vectors, 1) + features.update({"sentence_embedding": output_vector}) + return features + + def get_sentence_embedding_dimension(self): + return self.pooling_output_dimension + + def get_config_dict(self): + return {key: self.__dict__[key] for key in self.config_keys} + + +class SBertModel(BertModel): + """ + Bert Language model. 
+ Model returns [seq, batch, hidden] shape + """ + + def __init__( + self, + config: ModelParallelConfig, + vocab_size, + hidden_size, + max_position_embeddings, + num_layers, + num_attention_heads, + ffn_hidden_size, + apply_query_key_layer_scaling=True, + kv_channels=None, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True, + init_method_std=0.02, + fp16_lm_cross_entropy=False, + hidden_dropout=0.1, + precision=16, + fp32_residual_connection=False, + activations_checkpoint_granularity=None, + activations_checkpoint_method=None, + activations_checkpoint_num_layers=1, + activations_checkpoint_layers_per_pipeline=None, + layernorm_epsilon=1e-5, + normalization='layernorm', + transformer_block_type='pre_ln', + masked_softmax_fusion=False, + bias_gelu_fusion=True, + bias_dropout_add_fusion=True, + openai_gelu=False, + onnx_safe=False, + add_binary_head=True, + skip_head=False, + megatron_legacy=False, + sequence_parallel=False, + position_embedding_type='learned_absolute', + ): + super().__init__( + config, + vocab_size, + hidden_size, + max_position_embeddings, + num_layers, + num_attention_heads, + ffn_hidden_size, + apply_query_key_layer_scaling, + kv_channels, + num_tokentypes, + parallel_output, + pre_process, + post_process, + init_method_std, + fp16_lm_cross_entropy, + hidden_dropout, + precision, + fp32_residual_connection, + activations_checkpoint_granularity, + activations_checkpoint_method, + activations_checkpoint_num_layers, + activations_checkpoint_layers_per_pipeline, + layernorm_epsilon, + normalization, + transformer_block_type, + masked_softmax_fusion, + bias_gelu_fusion, + bias_dropout_add_fusion, + openai_gelu, + onnx_safe, + add_binary_head, + skip_head, + megatron_legacy, + sequence_parallel, + position_embedding_type, + ) + + self.pooling_add_on = Pooling( + word_embedding_dimension=1024, + pooling_mode_cls_token=False, + pooling_mode_mean_tokens=True, + pooling_mode_max_tokens=False, + pooling_mode_mean_sqrt_len_tokens=False, + ) + + self.normalize_add_on = Normalize() + + def forward( + self, + bert_model_input, + attention_mask, + token_type_ids=None, + lm_labels=None, + checkpoint_activations_all_layers=None, + ): + + extended_attention_mask = bert_extended_attention_mask(attention_mask) + + if parallel_state.is_pipeline_first_stage(): + input_ids = bert_model_input + position_ids = build_position_ids(input_ids) + else: + position_ids = None + input_ids = None + + lm_output = self.language_model( + input_ids, + position_ids, + extended_attention_mask, + token_type_ids=token_type_ids, + checkpoint_activations_all_layers=checkpoint_activations_all_layers, + ) + + if self.post_process and self.add_binary_head: + + lm_output, _ = lm_output + + add_on_inputs = {"token_embeddings": lm_output[0].permute(1, 0, 2), "attention_mask": attention_mask} + lm_output = self.pooling_add_on(add_on_inputs) + lm_output = self.normalize_add_on(lm_output) + + return lm_output['sentence_embedding'] + + +class MegatronSBertModel(MegatronBertModel): + """ + Megatron Bert pretraining. 
+ Model returns [batch, seq, hidden] shape + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + + super().__init__(cfg, trainer=trainer) + + self.cross_entropy_loss = torch.nn.CrossEntropyLoss(label_smoothing=cfg.get('label_smoothing', 0.0)) + softmax_temp = cfg.get('softmax_temp', 0.05) + self.scale = 1.0 / softmax_temp + train_file_path = self.cfg.data.data_prefix + with open(train_file_path) as f: + train_data = json.load(f) + + random_seed = 42 + set_seed(random_seed) + random.shuffle(train_data) + + self.train_data = train_data + + def model_provider_func(self, pre_process, post_process): + cfg = self.cfg + num_tokentypes = 2 if cfg.bert_binary_head else 0 + + if self.mcore_bert: + raise ValueError("mcore not supported for SBERT") + + else: + model = SBertModel( + config=self.model_parallel_config, + vocab_size=self.padded_vocab_size, + hidden_size=cfg.hidden_size, + max_position_embeddings=cfg.max_position_embeddings, + num_layers=cfg.num_layers, + num_attention_heads=cfg.num_attention_heads, + apply_query_key_layer_scaling=cfg.get('apply_query_key_layer_scaling', True), + kv_channels=cfg.get('kv_channels', None), + ffn_hidden_size=cfg.ffn_hidden_size, + num_tokentypes=num_tokentypes, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + init_method_std=cfg.get('init_method_std', 0.02), + fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False), + hidden_dropout=cfg.get('hidden_dropout', 0.1), + precision=cfg.get('precision', 16), + fp32_residual_connection=cfg.get('fp32_residual_connection', False), + activations_checkpoint_granularity=self.cfg.get('activations_checkpoint_granularity', None), + activations_checkpoint_method=self.cfg.get('activations_checkpoint_method', None), + activations_checkpoint_num_layers=self.cfg.get('activations_checkpoint_num_layers', 1), + activations_checkpoint_layers_per_pipeline=self.cfg.get( + 'activations_checkpoint_layers_per_pipeline', None + ), + layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5), + masked_softmax_fusion=cfg.get('masked_softmax_fusion', True), + normalization=cfg.get('normalization', 'layernorm'), + transformer_block_type=cfg.get('transformer_block_type', 'pre_ln'), + bias_gelu_fusion=cfg.get('bias_gelu_fusion', True), + bias_dropout_add_fusion=cfg.get("bias_dropout_add_fusion", True), + onnx_safe=cfg.get('onnx_safe', False), + add_binary_head=cfg.bert_binary_head, + skip_head=cfg.get('skip_head', False), + megatron_legacy=cfg.get('megatron_legacy', False), + position_embedding_type=self.cfg.get("position_embedding_type", "learned_absolute"), + ) + + return model + + def build_train_valid_test_datasets(self): + + train_file_path = self.cfg.data.data_prefix + + train_data = self.train_data + + query_prefix = "query:" + passage_prefix = "passage:" + evaluation_sample_size = self.cfg.data.get("evaluation_sample_size", 100) + hard_negatives_to_train = self.cfg.data.get("hard_negatives_to_train", 4) + evaluation_steps = self.cfg.data.get("evaluation_steps", 100) + + # TODO @ataghibakhsh: Handle valid and test datasets better + + self._train_ds = None + self._validation_ds = None + self._test_ds = None + + if train_file_path: # we don't support calculating validation loss for multiple train files + valid_data = None + if evaluation_sample_size: + if evaluation_steps == 0: + raise ValueError( + "The --evaluation_steps should be greater than 0 " "when --evaluation_sample_size is set" + ) + + if evaluation_sample_size >= len(train_data): + raise ValueError("The --evaluation_sample_size cannot 
be greater " "than train set size.") + + valid_data = train_data[-evaluation_sample_size:] + train_data = train_data[:-evaluation_sample_size] + + if evaluation_sample_size: + self._validation_ds = BertEmbeddingDataset( + valid_data, + num_hard_negs=hard_negatives_to_train, + query_prefix=query_prefix, + passage_prefix=passage_prefix, + ) + + self._train_ds = BertEmbeddingDataset( + train_data, num_hard_negs=hard_negatives_to_train, query_prefix=query_prefix, passage_prefix=passage_prefix + ) + + if self._train_ds is not None: + logging.info(f'Length of train dataset: {len(self._train_ds)}') + if self._validation_ds is not None: + logging.info(f'Length of val dataset: {len(self._validation_ds)}') + if self._test_ds is not None: + logging.info(f'Length of test dataset: {len(self._test_ds)}') + logging.info(f'Finished building Bert datasets.') + + return self._train_ds, self._validation_ds, self._test_ds + + def setup(self, stage=None): + """ PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: + stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. + """ + + num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert( + self.model + ) + + logging.info( + f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' + f'Tensor model parallel rank: {parallel_state.get_tensor_model_parallel_rank()}, ' + f'Number of model parameters on device: {num_parameters_on_device:.2e}. ' + f'Total number of model parameters: {total_num_parameters:.2e}.' + ) + + resume_checkpoint_path = self.trainer.ckpt_path + if resume_checkpoint_path: + init_consumed_samples = self._extract_consumed_samples_from_ckpt(resume_checkpoint_path) + else: + init_consumed_samples = 0 + self.init_consumed_samples = init_consumed_samples + self.init_global_step = self.trainer.global_step + + if stage == 'predict': + return + else: + # TODO: consider adding a ModelPT guard to check if model is being restored. 
+ # allowing restored models to optionally setup datasets + if self.cfg.data.dataloader_type == "LDDL": + self.build_LDDL_data(self.cfg.data) + torch.distributed.barrier() + else: + self.build_train_valid_test_datasets() + self.setup_training_data(self.cfg.data) + self.setup_validation_data(self.cfg.data) + # self.setup_test_data(self.cfg.data) + + # when using pipeline model parallel the final stage need to initialize word embeddings + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + if isinstance(self.model, list): + for i, module in enumerate(self.model): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + sync_embeddings = ( + module.initialize_last_stage_with_word_embeddings + if self.mcore_bert + else module.sync_initial_word_embeddings + ) + sync_embeddings() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + else: + sync_embeddings = ( + self.model.initialize_last_stage_with_word_embeddings + if self.mcore_bert + else self.model.sync_initial_word_embeddings + ) + sync_embeddings() + + if self.cfg.get('transformer_engine', False) or self.cfg.get('mcore_bert', False): + self.setup_transformer_engine_tp_groups() + + @classmethod + def merge_cfg_with(cls, path, cfg): + """ + Merge a given configuration dictionary `cfg` with the configuration dictionary + obtained from restoring a MegatronBertModel at the specified `path`. + + Args: + path (str): The path to the Bert model checkpoint to be restored. + cfg (DictConfig): The configuration dictionary to merge. + + Returns: + DictConfig: The merged configuration dictionary. + + Examples: + >>> path = "/path/to/model/checkpoint" + >>> cfg = DictConfig({"model": {"key": "value"}, "trainer": {"precision": 16}}) + >>> merged_cfg = merge_cfg_with(path, cfg) + + Notes: + - The function resolves variables within the `cfg` dictionary using `OmegaConf.resolve`. + - Keys in `cfg.model` will override the corresponding keys in the output dictionary. + - If "train_ds" exists in `cfg.model.data`, it updates `micro_batch_size` and `global_batch_size`. + - If `cfg.trainer` contains a "precision" key, it updates `output.precision`. 
+ + """ + + base_cfg = cls.restore_from(path, return_config=True) + + OmegaConf.resolve(cfg) + with open_dict(base_cfg): + for key, val in cfg.model.items(): + base_cfg[key] = val + if "train_ds" in cfg.model.data: + base_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size + base_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size + if cfg.get("trainer", None) and cfg.trainer.get("precision"): + base_cfg.precision = cfg.trainer.precision + + return base_cfg + + def build_pretraining_data_loader(self, dataset, consumed_samples): + """Buld dataloader given an input dataset.""" + + if dataset is None: + return None + + # Megatron sampler + if hasattr(self.cfg.data, 'dataloader_type') and self.cfg.data.dataloader_type is not None: + if self.cfg.data.dataloader_type == 'single': + batch_sampler = MegatronPretrainingSampler( + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=self.cfg.micro_batch_size, + global_batch_size=self.cfg.global_batch_size, + data_parallel_rank=parallel_state.get_data_parallel_rank(), + data_parallel_size=parallel_state.get_data_parallel_world_size(), + drop_last=self.cfg.get('drop_last', True), + ) + elif self.cfg.data.dataloader_type == 'cyclic': + batch_sampler = MegatronPretrainingRandomSampler( + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=self.cfg.micro_batch_size, + data_parallel_rank=parallel_state.get_data_parallel_rank(), + data_parallel_size=parallel_state.get_data_parallel_world_size(), + drop_last=self.cfg.get('drop_last', True), + ) + else: + raise ValueError('cfg.data.dataloader_type must be "single" or "cyclic"') + else: + raise ValueError('cfg.data.dataloader_type not found. Must be "single" or "cyclic"') + + # Torch dataloader. + + dataloader = torch.utils.data.DataLoader( + dataset, + shuffle=False, + batch_sampler=batch_sampler, + num_workers=self.cfg.data.num_workers, + pin_memory=True, + persistent_workers=True if self.cfg.data.num_workers > 0 else False, + ) + + dataloader.collate_fn = self.batching_collate + + return dataloader + + def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]): + + max_seq_length = self.cfg.encoder_seq_length + do_lower_case = self.cfg.tokenizer.get("do_lower_case", False) + """ + Tokenizes a text and maps tokens to token-ids + """ + output = {} + if isinstance(texts[0], str): + to_tokenize = [texts] + elif isinstance(texts[0], dict): + to_tokenize = [] + output["text_keys"] = [] + for lookup in texts: + text_key, text = next(iter(lookup.items())) + to_tokenize.append(text) + output["text_keys"].append(text_key) + to_tokenize = [to_tokenize] + else: + batch1, batch2 = [], [] + for text_tuple in texts: + batch1.append(text_tuple[0]) + batch2.append(text_tuple[1]) + to_tokenize = [batch1, batch2] + + # strip + to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize] + + # Lowercase + if do_lower_case: + to_tokenize = [[s.lower() for s in col] for col in to_tokenize] + + output.update( + self.tokenizer.tokenizer( + *to_tokenize, padding=True, truncation="longest_first", return_tensors="pt", max_length=max_seq_length, + ) + ) + return output + + def batching_collate(self, batch): + """ + Transforms a batch from a SmartBatchingDataset to a batch of tensors for the model + Here, batch is a list of InputExample instances: [InputExample(...), ...] 
+ + :param batch: + a batch from a SmartBatchingDataset + :return: + a batch of tensors for the model + """ + + sentence_features = [self.tokenize(sentence) for sentence in zip(*batch)] + + return sentence_features + + def get_forward_output_and_loss_func(self): + def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): + + batches = next(dataloader_iter) + + ( + tokens_batch, + types_batch, + sentence_order_batch, + loss_mask_batch, + lm_labels_batch, + padding_mask_batch, + ) = ([], [], [], [], [], []) + for batch in batches: + tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = ( + batch['input_ids'].cuda(non_blocking=True), + batch['token_type_ids'].cuda(non_blocking=True), + None, + None, + None, + batch['attention_mask'].cuda(non_blocking=True), + ) + tokens_batch.append(tokens) + types_batch.append(types) + sentence_order_batch.append(sentence_order) + loss_mask_batch.append(loss_mask) + lm_labels_batch.append(lm_labels) + padding_mask_batch.append(padding_mask) + + if not self.cfg.bert_binary_head: + types = None + + forward_args = [ + {"input_ids": tokens, "token_type_ids": types, "attention_mask": padding_mask} + for tokens, padding_mask, types in zip(tokens_batch, padding_mask_batch, types_batch) + ] + + if self.mcore_bert: + raise Exception("mcore not supported at the moment. It will be added in the near future") + else: + output_tensor = [self.forward(**forward_arg).permute(1, 0) for forward_arg in forward_args] + + def loss_func(output_tensor): + + loss_dict = self.loss_func(output_tensor) + + if 'sop loss' in loss_dict: + lm_loss = loss_dict['lm loss'] + sop_loss = loss_dict['sop loss'] + loss = lm_loss + sop_loss + reduced_loss = average_losses_across_data_parallel_group([loss, lm_loss, sop_loss]) + else: + lm_loss = loss_dict['lm loss'] + loss = lm_loss + reduced_loss = average_losses_across_data_parallel_group([loss, lm_loss]) + + return loss, {'loss': reduced_loss} + + return output_tensor, loss_func + + return fwd_output_and_loss_func + + def loss_func(self, output_tensor): + queries = output_tensor[0] # shape (bs, embedding_dim) + positives = output_tensor[1] # shape (bs, embedding_dim) + + pos_inbatch_negs_scores = torch.mm( + queries, positives.transpose(0, 1) + ) # shape (bs, bs); each positive is negative for other queries. + + hard_negs = output_tensor[2:] # List of length "num_negatives", each tensor of shape (bs, embedding_dim) + + hard_negs_scores = ( + torch.multiply(queries.unsqueeze(0).repeat(len(hard_negs), 1, 1), torch.stack(hard_negs),).sum(axis=-1).T + ) # shape = (bs, num_negatives); Hard negatives are not shared between queries. 
+ + scores = torch.cat([pos_inbatch_negs_scores, hard_negs_scores], axis=1) + + scores *= self.scale + + labels = torch.tensor( + range(len(scores)), dtype=torch.long, device=scores.device + ) # Indices of the (query, positive) pairs + + return {'lm loss': self.cross_entropy_loss(scores, labels)} From 150784c771623f861e7b6c329f1cca2d3d6267de Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Fri, 16 Feb 2024 19:38:36 -0700 Subject: [PATCH 21/28] Update readme (#8440) * update Signed-off-by: eharper * udpate Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * landing pages added * landing page added for vision * landing pages updated * some minor changes to the main readme * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * update Signed-off-by: eharper * typo fixed * update Signed-off-by: eharper --------- Signed-off-by: eharper Co-authored-by: ntajbakhsh Signed-off-by: Michal Futrega --- README.rst | 132 ++++++++-------------- docs/source/index.rst | 86 +++++++------- docs/source/multimodal/api.rst | 2 +- docs/source/nlp/api.rst | 4 +- docs/source/nlp/information_retrieval.rst | 2 +- docs/source/nlp/nemo_megatron/intro.rst | 12 +- docs/source/starthere/intro.rst | 15 ++- nemo/collections/asr/README.md | 37 ++++++ nemo/collections/multimodal/README.md | 27 +++++ nemo/collections/nlp/README.md | 13 +++ nemo/collections/tts/README.md | 7 ++ nemo/collections/vision/README.md | 6 + 12 files changed, 196 insertions(+), 147 deletions(-) create mode 100644 nemo/collections/asr/README.md create mode 100644 nemo/collections/multimodal/README.md create mode 100644 nemo/collections/nlp/README.md create mode 100644 nemo/collections/tts/README.md create mode 100644 nemo/collections/vision/README.md diff --git a/README.rst b/README.rst index 44e5df6b7488..3135bdbfabdd 100644 --- a/README.rst +++ b/README.rst @@ -35,7 +35,7 @@ .. _main-readme: -**NVIDIA NeMo** +**NVIDIA NeMo Framework** =============== Latest News @@ -57,92 +57,66 @@ such as FSDP, Mixture-of-Experts, and RLHF with TensorRT-LLM to provide speedups Introduction ------------ -NVIDIA NeMo is a conversational AI toolkit built for researchers working on automatic speech recognition (ASR), -text-to-speech synthesis (TTS), large language models (LLMs), and -natural language processing (NLP). -The primary objective of NeMo is to help researchers from industry and academia to reuse prior work (code and pretrained models) -and make it easier to create new `conversational AI models `_. +NVIDIA NeMo Framework is a generative AI framework built for researchers and pytorch developers +working on large language models (LLMs), multimodal models (MM), automatic speech recognition (ASR), +and text-to-speech synthesis (TTS). +The primary objective of NeMo is to provide a scalable framework for researchers and developers from industry and academia +to more easily implement and design new generative AI models by being able to leverage existing code and pretrained models. 
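Stepping back to the embedding model's `loss_func` completed just above: it scores each query against every in-batch positive plus its own hard negatives, scales the logits, and takes cross-entropy against the diagonal indices. Below is a standalone sketch of that objective; the function name, the `scale` value, and the toy shapes are illustrative and not taken from the patch.

```python
# Standalone sketch of an in-batch-negatives contrastive loss with hard negatives.
import torch
import torch.nn.functional as F

def contrastive_loss(queries, positives, hard_negs, scale=20.0):
    # queries, positives: (bs, dim); hard_negs: list of (bs, dim) tensors; scale is illustrative.
    pos_scores = queries @ positives.T  # (bs, bs): every other in-batch positive acts as a negative
    if hard_negs:
        # (bs, num_negs): hard negatives are query-specific, not shared across the batch
        neg_scores = torch.stack([(queries * n).sum(-1) for n in hard_negs], dim=1)
        scores = torch.cat([pos_scores, neg_scores], dim=1)
    else:
        scores = pos_scores
    scores = scores * scale
    labels = torch.arange(scores.size(0), device=scores.device)  # row i's correct class is column i
    return F.cross_entropy(scores, labels)

# Toy usage with random embeddings
loss = contrastive_loss(torch.randn(4, 8), torch.randn(4, 8), [torch.randn(4, 8) for _ in range(2)])
```

Each row of `scores` holds one query's similarity to all candidates, so the label for row `i` is simply `i`; hard negatives only ever compete within their own row.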
+ +For technical documentation, please see the `NeMo Framework User Guide `_. All NeMo models are trained with `Lightning `_ and training is automatically scalable to 1000s of GPUs. -Additionally, NeMo Megatron LLM models can be trained up to 1 trillion parameters using tensor and pipeline model parallelism. -NeMo models can be optimized for inference and deployed for production use-cases with `NVIDIA Riva `_. + +When applicable, NeMo models take advantage of the latest possible distributed training techniques, +including parallelism strategies such as + +* data parallelism +* tensor parallelism +* pipeline model parallelism +* fully sharded data parallelism (FSDP) +* sequence parallelism +* context parallelism +* mixture-of-experts (MoE) + +and mixed precision training recipes with bfloat16 and FP8 training. + +NeMo's Transformer based LLM and Multimodal models leverage `NVIDIA Transformer Engine `_ for FP8 training on NVIDIA Hopper GPUs +and leverages `NVIDIA Megatron Core `_ for scaling transformer model training. + +NeMo LLMs can be aligned with state of the art methods such as SteerLM, DPO and Reinforcement Learning from Human Feedback (RLHF), +see `NVIDIA NeMo Aligner `_ for more details. + +NeMo LLM and Multimodal models can be deployed and optimized with `NVIDIA Inference Microservices (Early Access) `_. + +NeMo ASR and TTS models can be optimized for inference and deployed for production use-cases with `NVIDIA Riva `_. + +For scaling NeMo LLM and Multimodal training on Slurm clusters or public clouds, please see the `NVIDIA Framework Launcher `_. +The NeMo Framework launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and Multimodal models and also has an `Autoconfigurator `_ +which can be used to find the optimal model parallel configuration for training on a specific cluster. +To get started quickly with the NeMo Framework Launcher, please see the `NeMo Framework Playbooks `_ +The NeMo Framework Launcher does not currently support ASR and TTS training but will soon. Getting started with NeMo is simple. State of the Art pretrained NeMo models are freely available on `HuggingFace Hub `_ and `NVIDIA NGC `_. -These models can be used to transcribe audio, synthesize speech, or translate text in just a few lines of code. +These models can be used to generate text or images, transcribe audio, and synthesize speech in just a few lines of code. We have extensive `tutorials `_ that -can be run on `Google Colab `_. +can be run on `Google Colab `_ or with our `NGC NeMo Framework Container. `_ +and we have `playbooks `_ for users that want to train NeMo models with the NeMo Framework Launcher. For advanced users that want to train NeMo models from scratch or finetune existing NeMo models we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. -For scaling NeMo LLM training on Slurm clusters or public clouds, please see the `NVIDIA NeMo Megatron Launcher `_. -The NM launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and also has an `Autoconfigurator `_ -which can be used to find the optimal model parallel configuration for training on a specific cluster. 
- Key Features ------------ -* Speech processing - * `HuggingFace Space for Audio Transcription (File, Microphone and YouTube) `_ - * `Pretrained models `_ available in 14+ languages - * `Automatic Speech Recognition (ASR) `_ - * Supported ASR `models `_: - * Jasper, QuartzNet, CitriNet, ContextNet - * Conformer-CTC, Conformer-Transducer, FastConformer-CTC, FastConformer-Transducer - * Squeezeformer-CTC and Squeezeformer-Transducer - * LSTM-Transducer (RNNT) and LSTM-CTC - * Supports the following decoders/losses: - * CTC - * Transducer/RNNT - * Hybrid Transducer/CTC - * NeMo Original `Multi-blank Transducers `_ and `Token-and-Duration Transducers (TDT) `_ - * Streaming/Buffered ASR (CTC/Transducer) - `Chunked Inference Examples `_ - * `Cache-aware Streaming Conformer `_ with multiple lookaheads (including microphone streaming `tutorial `_). - * Beam Search decoding - * `Language Modelling for ASR (CTC and RNNT) `_: N-gram LM in fusion with Beam Search decoding, Neural Rescoring with Transformer - * `Support of long audios for Conformer with memory efficient local attention `_ - * `Speech Classification, Speech Command Recognition and Language Identification `_: MatchboxNet (Command Recognition), AmberNet (LangID) - * `Voice activity Detection (VAD) `_: MarbleNet - * ASR with VAD Inference - `Example `_ - * `Speaker Recognition `_: TitaNet, ECAPA_TDNN, SpeakerNet - * `Speaker Diarization `_ - * Clustering Diarizer: TitaNet, ECAPA_TDNN, SpeakerNet - * Neural Diarizer: MSDD (Multi-scale Diarization Decoder) - * `Speech Intent Detection and Slot Filling `_: Conformer-Transformer -* Natural Language Processing - * `NeMo Megatron pre-training of Large Language Models `_ - * `Neural Machine Translation (NMT) `_ - * `Punctuation and Capitalization `_ - * `Token classification (named entity recognition) `_ - * `Text classification `_ - * `Joint Intent and Slot Classification `_ - * `Question answering `_ - * `GLUE benchmark `_ - * `Information retrieval `_ - * `Entity Linking `_ - * `Dialogue State Tracking `_ - * `Prompt Learning `_ - * `NGC collection of pre-trained NLP models. `_ - * `Synthetic Tabular Data Generation `_ -* Text-to-Speech Synthesis (TTS): - * `Documentation `_ - * Mel-Spectrogram generators: FastPitch, SSL FastPitch, Mixer-TTS/Mixer-TTS-X, RAD-TTS, Tacotron2 - * Vocoders: HiFiGAN, UnivNet, WaveGlow - * End-to-End Models: VITS - * `Pre-trained Model Checkpoints in NVIDIA GPU Cloud (NGC) `_ -* `Tools `_ - * `Text Processing (text normalization and inverse text normalization) `_ - * `NeMo Forced Aligner `_ - * `CTC-Segmentation tool `_ - * `Speech Data Explorer `_: a dash-based tool for interactive exploration of ASR/TTS datasets - * `Speech Data Processor `_ - - -Built for speed, NeMo can utilize NVIDIA's Tensor Cores and scale out training to multiple GPUs and multiple nodes. +* `Large Language Models `_ +* `Multimodal `_ +* `Automatic Speech Recognition `_ +* `Text to Speech `_ +* `Computer Vision `_ Requirements ------------ @@ -151,8 +125,8 @@ Requirements 2) Pytorch 1.13.1 or above 3) NVIDIA GPU, if you intend to do model training -Documentation -------------- +Developer Documentation +----------------------- .. |main| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=main :alt: Documentation Status @@ -172,18 +146,6 @@ Documentation | Stable | |stable| | `Documentation of the stable (i.e. most recent release) branch. 
`_ | +---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ -Tutorials ---------- -A great way to start with NeMo is by checking `one of our tutorials `_. - -You can also get a high-level overview of NeMo by watching the talk *NVIDIA NeMo: Toolkit for Conversational AI*, presented at PyData Yerevan 2022: - -|pydata| - -.. |pydata| image:: https://img.youtube.com/vi/J-P6Sczmas8/maxres3.jpg - :target: https://www.youtube.com/embed/J-P6Sczmas8?mute=0&start=14&autoplay=0 - :width: 600 - :alt: NeMo presentation at PyData@Yerevan 2022 Getting help with NeMo ---------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 7407886eefc8..9d66d693000e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,5 +1,5 @@ -NVIDIA NeMo User Guide -====================== +NVIDIA NeMo Framework Developer Docs +==================================== .. toctree:: :maxdepth: 2 @@ -12,18 +12,28 @@ NVIDIA NeMo User Guide starthere/migration-guide .. toctree:: - :maxdepth: 2 - :caption: NeMo Core - :name: core + :maxdepth: 3 + :caption: Multimodal (MM) + :name: Multimodal - core/core - core/exp_manager - core/neural_types - core/export - core/adapters/intro - core/api + multimodal/mllm/intro + multimodal/vlm/intro + multimodal/text2img/intro + multimodal/nerf/intro + multimodal/api +.. toctree:: + :maxdepth: 3 + :caption: Large Language Models (LLMs) + :name: Large Language Models + + nlp/nemo_megatron/intro + nlp/models + nlp/machine_translation/machine_translation + nlp/megatron_onnx_export + nlp/api + .. toctree:: :maxdepth: 2 :caption: Speech Processing @@ -36,19 +46,6 @@ NVIDIA NeMo User Guide asr/ssl/intro asr/speech_intent_slot/intro -.. toctree:: - :maxdepth: 3 - :caption: Natural Language Processing - :name: Natural Language Processing - - nlp/nemo_megatron/intro - nlp/machine_translation/machine_translation - nlp/text_normalization/intro - nlp/api - nlp/megatron_onnx_export - nlp/models - - .. toctree:: :maxdepth: 1 :caption: Text To Speech (TTS) @@ -56,6 +53,26 @@ NVIDIA NeMo User Guide tts/intro +.. toctree:: + :maxdepth: 2 + :caption: Vision + :name: vision + + vision/intro + + +.. toctree:: + :maxdepth: 2 + :caption: NeMo Core + :name: core + + core/core + core/exp_manager + core/neural_types + core/export + core/adapters/intro + core/api + .. toctree:: :maxdepth: 2 :caption: Common @@ -71,27 +88,10 @@ NVIDIA NeMo User Guide text_processing/g2p/g2p common/intro -.. toctree:: - :maxdepth: 3 - :caption: Multimodal (MM) - :name: Multimodal - - multimodal/mllm/intro - multimodal/vlm/intro - multimodal/text2img/intro - multimodal/nerf/intro - multimodal/api - -.. toctree:: - :maxdepth: 2 - :caption: Vision - :name: vision - - vision/intro .. 
toctree:: :maxdepth: 3 - :caption: Tools - :name: Tools + :caption: Speech Tools + :name: Speech Tools tools/intro diff --git a/docs/source/multimodal/api.rst b/docs/source/multimodal/api.rst index 63ce477273b3..d6f96e6c6ea4 100644 --- a/docs/source/multimodal/api.rst +++ b/docs/source/multimodal/api.rst @@ -1,4 +1,4 @@ -NeMo Megatron API +Multimodal API ======================= Model Classes diff --git a/docs/source/nlp/api.rst b/docs/source/nlp/api.rst index 33709bd05a19..b9b4d529ba46 100755 --- a/docs/source/nlp/api.rst +++ b/docs/source/nlp/api.rst @@ -1,5 +1,5 @@ -NeMo Megatron API -======================= +Large language Model API +======================== Pretraining Model Classes ------------------------- diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst index 5cf87143848c..b40caeee8a3b 100644 --- a/docs/source/nlp/information_retrieval.rst +++ b/docs/source/nlp/information_retrieval.rst @@ -8,7 +8,7 @@ The model architecture and pre-training process are detailed in the `Sentence-BE Sentence-BERT utilizes a BERT-based architecture, but it is trained using a siamese and triplet network structure to derive fixed-sized sentence embeddings that capture semantic information. Sentence-BERT is commonly used to generate high-quality sentence embeddings for various downstream natural language processing tasks, such as semantic textual similarity, clustering, and information retrieval -Data Input for the Senntence-BERT model +Data Input for the Sentence-BERT model --------------------------------------- The fine-tuning data for the Sentence-BERT (SBERT) model should consist of data instances, diff --git a/docs/source/nlp/nemo_megatron/intro.rst b/docs/source/nlp/nemo_megatron/intro.rst index 80b30a267b18..faf315a40c04 100644 --- a/docs/source/nlp/nemo_megatron/intro.rst +++ b/docs/source/nlp/nemo_megatron/intro.rst @@ -1,8 +1,7 @@ -NeMo Megatron -============= +Large Language Models +===================== -Megatron :cite:`nlp-megatron-shoeybi2019megatron` is a large, powerful transformer developed by the Applied Deep Learning Research -team at NVIDIA. NeMo Megatron supports several types of models: +To learn more about using NeMo to train Large Language Models at scale, please refer to the `NeMo Framework User Guide! `_. * GPT-style models (decoder only) * T5/BART/UL2-style models (encoder-decoder) @@ -10,11 +9,6 @@ team at NVIDIA. NeMo Megatron supports several types of models: * RETRO model (decoder only) - -.. note:: - NeMo Megatron has an Enterprise edition which contains tools for data preprocessing, hyperparameter tuning, container, scripts for various clouds and more. With Enterprise edition you also get deployment tools. Apply for `early access here `_ . - - .. toctree:: :maxdepth: 1 diff --git a/docs/source/starthere/intro.rst b/docs/source/starthere/intro.rst index e6a59b0832ab..185350bad3ab 100644 --- a/docs/source/starthere/intro.rst +++ b/docs/source/starthere/intro.rst @@ -8,14 +8,17 @@ Introduction .. _dummy_header: -`NVIDIA NeMo `_, part of the NVIDIA AI platform, is a toolkit for building new state-of-the-art -conversational AI models. NeMo has separate collections for Automatic Speech Recognition (ASR), -Natural Language Processing (NLP), and Text-to-Speech (TTS) models. Each collection consists of +NVIDIA NeMo Framework is an end-to-end, cloud-native framework to build, customize, and deploy generative AI models anywhere. 
+To learn more about using NeMo in generative AI workflows, please refer to the `NeMo Framework User Guide! `_ + +`NVIDIA NeMo Framework `_ has separate collections for Large Language Models (LLMs), +Multimodal (MM), Computer Vision (CV), Automatic Speech Recognition (ASR), +and Text-to-Speech (TTS) models. Each collection consists of prebuilt modules that include everything needed to train on your data. -Every module can easily be customized, extended, and composed to create new conversational AI +Every module can easily be customized, extended, and composed to create new generative AI model architectures. -Conversational AI architectures are typically large and require a lot of data and compute +Generative AI architectures are typically large and require a lot of data and compute for training. NeMo uses `PyTorch Lightning `_ for easy and performant multi-GPU/multi-node mixed-precision training. @@ -38,7 +41,7 @@ Before you begin using NeMo, it's assumed you meet the following prerequisites. Quick Start Guide ----------------- -You can try out NeMo's ASR, NLP and TTS functionality with the example below, which is based on the `Audio Translation `_ tutorial. +You can try out NeMo's ASR, LLM and TTS functionality with the example below, which is based on the `Audio Translation `_ tutorial. Once you have :ref:`installed NeMo `, then you can run the code below: diff --git a/nemo/collections/asr/README.md b/nemo/collections/asr/README.md new file mode 100644 index 000000000000..9a1b947f2d18 --- /dev/null +++ b/nemo/collections/asr/README.md @@ -0,0 +1,37 @@ +# Automatic Speech Recognition (ASR) + +## Key Features + +* [HuggingFace Space for Audio Transcription (File, Microphone and YouTube)](https://huggingface.co/spaces/smajumdar/nemo_multilingual_language_id) +* [Pretrained models](https://ngc.nvidia.com/catalog/collections/nvidia:nemo_asr) available in 14+ languages +* [Automatic Speech Recognition (ASR)](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/intro.html) + * Supported ASR [models](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html): + * Jasper, QuartzNet, CitriNet, ContextNet + * Conformer-CTC, Conformer-Transducer, FastConformer-CTC, FastConformer-Transducer + * Squeezeformer-CTC and Squeezeformer-Transducer + * LSTM-Transducer (RNNT) and LSTM-CTC + * Supports the following decoders/losses: + * CTC + * Transducer/RNNT + * Hybrid Transducer/CTC + * NeMo Original [Multi-blank Transducers](https://arxiv.org/abs/2211.03541) and [Token-and-Duration Transducers (TDT)](https://arxiv.org/abs/2304.06795) + * Streaming/Buffered ASR (CTC/Transducer) - [Chunked Inference Examples](https://github.com/NVIDIA/NeMo/tree/stable/examples/asr/asr_chunked_inference) + * [Cache-aware Streaming Conformer](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer) with multiple lookaheads (including microphone streaming [tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_ASR_Microphone_Demo_Cache_Aware_Streaming.ipynb). 
+ * Beam Search decoding + * [Language Modelling for ASR (CTC and RNNT)](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html): N-gram LM in fusion with Beam Search decoding, Neural Rescoring with Transformer + * [Support of long audios for Conformer with memory efficient local attention](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/results.html#inference-on-long-audio) +* [Speech Classification, Speech Command Recognition and Language Identification](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_classification/intro.html): MatchboxNet (Command Recognition), AmberNet (LangID) +* [Voice activity Detection (VAD)](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#marblenet-vad): MarbleNet + * ASR with VAD Inference - [Example](https://github.com/NVIDIA/NeMo/tree/stable/examples/asr/asr_vad) +* [Speaker Recognition](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_recognition/intro.html): TitaNet, ECAPA_TDNN, SpeakerNet +* [Speaker Diarization](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_diarization/intro.html) + * Clustering Diarizer: TitaNet, ECAPA_TDNN, SpeakerNet + * Neural Diarizer: MSDD (Multi-scale Diarization Decoder) +* [Speech Intent Detection and Slot Filling](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_intent_slot/intro.html): Conformer-Transformer + +You can also get a high-level overview of NeMo ASR by watching the talk *NVIDIA NeMo: Toolkit for Conversational AI*, presented at PyData Yerevan 2022: + + +[![NVIDIA NeMo: Toolkit for Conversational AI](https://img.youtube.com/vi/J-P6Sczmas8/maxres3.jpg +)](https://www.youtube.com/embed/J-P6Sczmas8?mute=0&start=14&autoplay=0 + "NeMo presentation at PyData@Yerevan 2022") diff --git a/nemo/collections/multimodal/README.md b/nemo/collections/multimodal/README.md new file mode 100644 index 000000000000..c160ac89569d --- /dev/null +++ b/nemo/collections/multimodal/README.md @@ -0,0 +1,27 @@ +NeMo Multimodal Collections +============================ + +The NeMo Multimodal Collection supports a diverse range of multimodal models tailored for various tasks, including text-2-image generation, text-2-NeRF synthesis, multimodal language models (LLM), and foundational vision and language models. Leveraging existing modules from other NeMo collections such as LLM and Vision whenever feasible, our multimodal collections prioritize efficiency by avoiding redundant implementations and maximizing reuse of NeMo's existing modules. Here's a detailed list of the models currently supported within the multimodal collection: + +- **Foundation Vision-Language Models:** + - CLIP + +- **Foundation Text-to-Image Generation:** + - Stable Diffusion + - Imagen + +- **Customizable Text-to-Image Models:** + - SD-LoRA + - SD-ControlNet + - SD-Instruct pix2pix + +- **Multimodal Language Models:** + - NeVA + - LLAVA + +- **Text-to-NeRF Synthesis:** + - DreamFusion++ + +- **NSFW Detection Support** + +Our [documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/index.html) offers comprehensive insights into each supported model, facilitating seamless integration and utilization within your projects. 
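The ASR README above lists pretrained checkpoints on NGC and Hugging Face that can transcribe audio in a few lines of code. A hedged sketch of that workflow follows; the checkpoint name and audio path are placeholders, and the exact `transcribe` signature can differ between NeMo releases.

```python
# Illustrative only: load a pretrained NeMo ASR checkpoint and transcribe a local file.
# "stt_en_conformer_ctc_small" is one example NGC model name; substitute any listed checkpoint.
import nemo.collections.asr as nemo_asr

asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="stt_en_conformer_ctc_small")
transcripts = asr_model.transcribe(["path/to/audio.wav"])  # placeholder path
print(transcripts[0])
```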
diff --git a/nemo/collections/nlp/README.md b/nemo/collections/nlp/README.md new file mode 100644 index 000000000000..fc6644d28293 --- /dev/null +++ b/nemo/collections/nlp/README.md @@ -0,0 +1,13 @@ +NeMo NLP/LLM Collection +======================== + +The NeMo NLP/LLM Collection is designed to provide comprehensive support for on-demand large language community models as well as Nvidia's top LLM offerings. By harnessing the cutting-edge Megatron Core, our LLM collection is highly optimized, empowering NeMo users to undertake foundation model training across thousands of GPUs while facilitating fine-tuning of LLMs using techniques such as SFT and PEFT. Leveraging the Transformer Engine library, our collection ensures seamless support for FP8 workloads on Hopper H100 GPUs. Additionally, we prioritize supporting TRTLLM export for the released models, which can accelerate inference by 2-3x depending on the model size. Here's a detailed list of the models currently supported within the LLM collection: + +- **Bert** +- **GPT-style models** +- **Falcon** +- **code-llama 7B** +- **Mistral** +- **Mixtral** + +Our [documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/index.html) offers comprehensive insights into each supported model, facilitating seamless integration and utilization within your projects. diff --git a/nemo/collections/tts/README.md b/nemo/collections/tts/README.md new file mode 100644 index 000000000000..44b2b1b7a25c --- /dev/null +++ b/nemo/collections/tts/README.md @@ -0,0 +1,7 @@ +# Text-to-Speech Synthesis (TTS): + +* [Documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tts/intro.html#) +* Mel-Spectrogram generators: FastPitch, SSL FastPitch, Mixer-TTS/Mixer-TTS-X, RAD-TTS, Tacotron2 +* Vocoders: HiFiGAN, UnivNet, WaveGlow +* End-to-End Models: VITS +* [Pre-trained Model Checkpoints in NVIDIA GPU Cloud (NGC)](https://ngc.nvidia.com/catalog/collections/nvidia:nemo_tts) \ No newline at end of file diff --git a/nemo/collections/vision/README.md b/nemo/collections/vision/README.md new file mode 100644 index 000000000000..057f5b3a4719 --- /dev/null +++ b/nemo/collections/vision/README.md @@ -0,0 +1,6 @@ +NeMo Vision Collection +======================== + +The NeMo Vision Collection is designed to support the multimodal collection, particularly for models like LLAVA that necessitate a vision encoder implementation. At present, the vision collection features support for ViT, a customized version of the transformer model from Megatron core. + +Our [documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/index.html) offers comprehensive insights into each supported model, facilitating seamless integration and utilization within your projects. \ No newline at end of file From 5406d910ab52e95bee97ef1c5f0b79339e4d752f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 09:37:41 -0700 Subject: [PATCH 22/28] NeMo-Mistral to HF converter bugfix. 
(#8353) (#8442) Signed-off-by: Alexandros Koumparoulis Co-authored-by: akoumpa <153118171+akoumpa@users.noreply.github.com> Signed-off-by: Michal Futrega --- .../nlp_language_modeling/convert_nemo_mistral_7b_to_hf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/nlp_language_modeling/convert_nemo_mistral_7b_to_hf.py b/scripts/nlp_language_modeling/convert_nemo_mistral_7b_to_hf.py index 9e6403acd6c5..e78567d60554 100644 --- a/scripts/nlp_language_modeling/convert_nemo_mistral_7b_to_hf.py +++ b/scripts/nlp_language_modeling/convert_nemo_mistral_7b_to_hf.py @@ -114,6 +114,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: embed_weights_base_name = f'model.language_model.embedding.word_embeddings.weight' state_dict[hf_embed_weight_name] = param_to_weights(ckpt[embed_weights_base_name]) + head_num = model.cfg.num_attention_heads if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: num_query_groups = head_num else: @@ -123,7 +124,6 @@ def convert(in_file, precision=None, cpu_only=True) -> None: assert nemo_config.activation.startswith('fast-'), 'mcore only supports fast version of gated linear unit.' hidden_size = model.cfg.hidden_size - head_num = model.cfg.num_attention_heads num_layers = model.cfg.num_layers num_query_groups = model.cfg.get("num_query_groups", head_num) # different num_query_groups for 70B @@ -191,7 +191,7 @@ def convert(in_file, precision=None, cpu_only=True) -> None: hf_post_attn_ln_weight_name = f'model.layers.{l}.post_attention_layernorm.weight' if mcore_gpt: - post_attn_ln_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight' + post_attn_ln_base_name = f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_weight' else: post_attn_ln_base_name = f'model.language_model.encoder.layers.{l}.post_attention_layernorm.weight' state_dict[hf_post_attn_ln_weight_name] = param_to_weights(ckpt[post_attn_ln_base_name]) From b810c8c335e95716e0b92271ad492c34ab9f5197 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 16:09:05 -0700 Subject: [PATCH 23/28] Fixing mcore bert for TP, PP and SP (#8336) (#8443) * Fixing mcore bert for TP, PP and SP * Fixing mcore bert for TP, PP and SP * Fixing mcore version * Fixing mcore version * Update Jenkinsfile * Update Jenkinsfile * Update Jenkinsfile --------- Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Eric Harper Signed-off-by: Michal Futrega --- .../language_modeling/megatron_bert_model.py | 133 +++++++++++++----- 1 file changed, 96 insertions(+), 37 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index bef13367eb10..49b64268e6b9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -13,7 +13,8 @@ # limitations under the License. 
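A note on the Mistral converter fix above: in Megatron-core checkpoints the pre-MLP LayerNorm weight is fused into `mlp.linear_fc1`, while the weight fused into `self_attention.linear_qkv` corresponds to the norm in front of attention, so Hugging Face's `post_attention_layernorm` has to be sourced from the former. A minimal sketch of the corrected per-layer mapping, using only names visible in the diff; the helper function and the 32-layer loop are illustrative, not part of the script.

```python
# Illustrative helper: corrected HF <- NeMo name for one decoder layer's post-attention LayerNorm.
def post_attn_ln_map(layer_idx, mcore_gpt=True):
    hf_name = f"model.layers.{layer_idx}.post_attention_layernorm.weight"
    if mcore_gpt:
        # corrected mapping: the pre-MLP norm lives on linear_fc1 in mcore checkpoints
        nemo_name = f"model.decoder.layers.{layer_idx}.mlp.linear_fc1.layer_norm_weight"
    else:
        nemo_name = f"model.language_model.encoder.layers.{layer_idx}.post_attention_layernorm.weight"
    return hf_name, nemo_name

# e.g. for a hypothetical 32-layer model
mapping = dict(post_attn_ln_map(l) for l in range(32))
```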
import itertools -from typing import Any, Dict, List, Optional +import queue +from typing import Any, Dict, Iterator, List, Optional import torch import torch.nn.functional as F @@ -343,8 +344,8 @@ def training_step(self, dataloader_iter, batch_idx): losses_reduced_per_micro_batch = fwd_bwd_function( forward_step_func=self.get_forward_output_and_loss_func(), - data_iterator=dataloader_iter, - model=[self.model], + data_iterator=self._make_data_iterator_list(dataloader_iter), + model=self.model, num_microbatches=get_num_microbatches(), forward_only=False, seq_length=seq_length, @@ -405,6 +406,65 @@ def training_step(self, dataloader_iter, batch_idx): return loss_mean[0] + def _make_data_iterator_list(self, data_iterator: Iterator) -> List[Iterator]: + """ Convert data iterator into form expected by Megatron + With interleaved pipeline parallelism, Megatron expects a + list of one data iterator per model chunk. Each model + chunk independently gets data from its data iterator, so + we need to interact with the data iterator multiple times + for each microbatch step. Instead of incorporating this + logic into the data loader, we cache the iterator's output + to the first model chunk and reuse it in the other model + chunks. + """ + + if not isinstance(self.model, list) or len(self.model) == 1: + return data_iterator # TODO @tmoon: Remove + # TODO @tmoon: Use once available in Megatron-LM + # return DataIteratorList([data_iterator]) + + class CachingIterator: + """Iterator wrapper that caches values""" + + class Proxy: + """Returns values from caching iterator wrapper + Assumed to never advance past the caching iterator. + """ + + def __init__(self): + self.cache = queue.Queue() + + def __iter__(self): + return self + + def __next__(self): + return self.cache.get_nowait() + + def __init__(self, iterator: Iterator): + self.iterator = iterator + self.proxies = [] + + def make_proxy(self): + self.proxies.append(CachingIterator.Proxy()) + return self.proxies[-1] + + def __iter__(self): + return self + + def __next__(self): + val = next(self.iterator) + for proxy in self.proxies: + proxy.cache.put(val) + return val + + # Make list of iterator wrappers + iters = [CachingIterator(data_iterator)] + while len(iters) < len(self.model): + iters.append(iters[0].make_proxy()) + return iters # TODO @tmoon: Remove + # TODO @tmoon: Use once available in Megatron-LM + # return DataIteratorList(iters) + def allreduce_first_last_embeddings(self): # Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/training.py#L407 @@ -416,17 +476,16 @@ def allreduce_first_last_embeddings(self): parallel_state.is_pipeline_first_stage(ignore_virtual=True) or parallel_state.is_pipeline_last_stage(ignore_virtual=True) ): + module_list = self.get_model_module_list() if parallel_state.is_pipeline_first_stage(ignore_virtual=True): - if isinstance(self.model, list): - module = self.model[0] # only the first virtual rank has the embeddings - else: - module = self.model - if parallel_state.is_pipeline_last_stage(ignore_virtual=True): - if isinstance(self.model, list): - module = self.model[-1] # only the last virtual rank has the embeddings - else: - module = self.model - if module.share_token_embeddings: + module = module_list[0] + elif parallel_state.is_pipeline_last_stage(ignore_virtual=True): + module = module_list[-1] + + share_embeddings = ( + module.share_embeddings_and_output_weights if self.mcore_bert else module.share_token_embeddings + ) + if 
share_embeddings: word_embeddings_weight = ( module.shared_embedding_or_output_weight() if self.mcore_bert else module.word_embeddings_weight() ) @@ -453,8 +512,8 @@ def validation_step(self, dataloader_iter, batch_idx): losses_reduced_per_micro_batch = fwd_bwd_function( forward_step_func=self.get_forward_output_and_loss_func(), - data_iterator=dataloader_iter, - model=[self.model], + data_iterator=self._make_data_iterator_list(dataloader_iter), + model=self.model, num_microbatches=get_num_microbatches(), forward_only=True, seq_length=seq_length, @@ -727,23 +786,17 @@ def setup(self, stage=None): # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: - if isinstance(self.model, list): - for i, module in enumerate(self.model): - parallel_state.set_virtual_pipeline_model_parallel_rank(i) + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + parallel_state.set_virtual_pipeline_model_parallel_rank(index) sync_embeddings = ( module.initialize_last_stage_with_word_embeddings if self.mcore_bert else module.sync_initial_word_embeddings ) sync_embeddings() - parallel_state.set_virtual_pipeline_model_parallel_rank(0) - else: - sync_embeddings = ( - self.model.initialize_last_stage_with_word_embeddings - if self.mcore_bert - else self.model.sync_initial_word_embeddings - ) - sync_embeddings() + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + parallel_state.set_virtual_pipeline_model_parallel_rank(0) if self.cfg.get('transformer_engine', False) or self.cfg.get('mcore_bert', False): self.setup_transformer_engine_tp_groups() @@ -917,22 +970,28 @@ def configure_optimizers(self): # Disable overlapped grad sync for embedding grad when # pipeline parallelism is enabled if parallel_state.get_pipeline_model_parallel_world_size() > 1: + modules = self.get_model_module_list() if parallel_state.is_pipeline_first_stage(ignore_virtual=True): - if isinstance(self.model, list): - module = self.model[0] # only the first virtual rank has the embeddings - else: - module = self.model - if module.share_token_embeddings: - param = module.word_embeddings_weight() + module = modules[0] # only the first virtual rank has the embeddings + if self.cfg.get('share_embeddings_and_output_weights', True): + param = ( + module.shared_embedding_or_output_weight() + if self.mcore_bert + else module.word_embeddings_weight() + ) param._disable_greedy_grad_copy = not self.megatron_amp_O2 param._disable_overlap_grad_sync = True if parallel_state.is_pipeline_last_stage(ignore_virtual=True): - if isinstance(self.model, list): - module = self.model[-1] # only the last virtual rank has the embeddings + if len(modules) > 1: + module = modules[-1] # only the last virtual rank has the embeddings else: - module = self.model - if module.share_token_embeddings: - param = module.word_embeddings_weight() + module = modules[0] + if self.cfg.get('share_embeddings_and_output_weights', True): + param = ( + module.shared_embedding_or_output_weight() + if self.mcore_bert + else module.word_embeddings_weight() + ) param._disable_greedy_grad_copy = not self.megatron_amp_O2 param._disable_overlap_grad_sync = True From 76b9eea81e7a3577ff02792d213ac7433096b136 Mon Sep 17 00:00:00 2001 From: Tugrul Konuk Date: Wed, 21 Feb 2024 00:29:07 -0600 Subject: [PATCH 24/28] Add LoRA support to all linear layers (#7988) * Added LoRA support for the 
Dense layer of Attention * Added LoRA MLP support to MCore and NeMo models. * Change LoRA config default to QKV. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixed bug with ddp training. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * MCoreMixin chages. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * using new commit of meg-LM Signed-off-by: arendu * add cpu_offloading_num_layers to conversion script until bug in megatron is fixed Signed-off-by: Chen Cui * fix peft mixin arguments to follow mcore 0.5 Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update megatron commit to fix ci error Signed-off-by: Chen Cui * try to fix ci Signed-off-by: Chen Cui * try to fix ci Signed-off-by: Chen Cui * add cfg default Signed-off-by: Chen Cui --------- Signed-off-by: Adi Renduchintala Signed-off-by: Jiaqi Zeng Signed-off-by: arendu Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adi Renduchintala Co-authored-by: Jiaqi Zeng Co-authored-by: arendu Co-authored-by: HeyyyyyyG <49757268+HeyyyyyyG@users.noreply.github.com> Co-authored-by: Chen Cui Co-authored-by: Eric Harper Signed-off-by: Michal Futrega --- Jenkinsfile | 11 ++ .../conf/megatron_gpt_finetuning_config.yaml | 1 + .../common/megatron/adapters/mcore_mixins.py | 171 +++++++++++++++--- .../megatron/adapters/parallel_adapters.py | 79 +++++++- .../nlp/modules/common/megatron/attention.py | 7 + .../nlp/modules/common/megatron/mlp.py | 16 +- nemo/collections/nlp/parts/peft_config.py | 114 ++++++++++-- .../convert_starcoder_hf_to_nemo.py | 1 + 8 files changed, 351 insertions(+), 49 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 957b69e13c17..5d81a57c04c9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3470,6 +3470,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf examples/nlp/language_modeling/token_classification_results" } } + // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged stage('L2: Megatron GPT Pretraining and Resume Training TP=2') { when { anyOf { @@ -3506,6 +3507,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ model.num_layers=8 \ + model.cpu_offloading_num_layers=7 \ model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method='block' \ @@ -3541,6 +3543,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ model.num_layers=8 \ + model.cpu_offloading_num_layers=7 \ model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method='block' \ @@ -3590,6 +3593,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ model.num_layers=8 \ + model.cpu_offloading_num_layers=7 \ model.hidden_size=256 \ model.num_attention_heads=8 \ 
model.activations_checkpoint_method='block' \ @@ -3731,6 +3735,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' // sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" // } // } + // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged stage('L2: Megatron GPT with ALiBi Pretraining and Resume Training TP=2') { when { anyOf { @@ -3768,6 +3773,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ model.num_layers=8 \ + model.cpu_offloading_num_layers=7 \ model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method='block' \ @@ -3816,6 +3822,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" } } + // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged stage('L2: Megatron GPT with KERPLE Pretraining and Resume Training TP=2') { when { anyOf { @@ -3853,6 +3860,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ model.num_layers=8 \ + model.cpu_offloading_num_layers=7 \ model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method='block' \ @@ -3901,6 +3909,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" } } + // @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged stage('L2: Megatron GPT Pretraining and Resume Training PP=2') { when { anyOf { @@ -3941,6 +3950,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ model.num_layers=8 \ + model.cpu_offloading_num_layers=7 \ model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method='block' \ @@ -3979,6 +3989,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ model.num_layers=8 \ + model.cpu_offloading_num_layers=7 \ model.hidden_size=256 \ model.num_attention_heads=8 \ model.activations_checkpoint_method='block' \ diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index c381582aba45..af561ffe0aad 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -94,6 +94,7 @@ model: position_embedding_strategy: null # used only when weight_tying is True lora_tuning: + target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) adapter_dim: 32 adapter_dropout: 0.0 column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal diff --git 
a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index aa1896801c03..3d355255850a 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -17,8 +17,10 @@ from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb from megatron.core.transformer.attention import SelfAttention from megatron.core.transformer.custom_layers.transformer_engine import ( + SplitAlongDim, TEColumnParallelLinear, TELayerNormColumnParallelLinear, ) @@ -29,6 +31,9 @@ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, InfusedAdapterConfig, + Lora4HtoHAdapterConfig, + LoraDenseAttentionAdapterConfig, + LoraHto4HAdapterConfig, LoraKQVAdapterConfig, MLPInfusedAdapterConfig, ParallelLinearAdapterConfig, @@ -59,7 +64,9 @@ def mcore_register_adapters(self): """ Setup NeMo LoRA or IA3 adapter to this MCore layer. """ - self.set_accepted_adapter_types([LoraKQVAdapterConfig._target_, InfusedAdapterConfig._target_]) + self.set_accepted_adapter_types( + [LoraKQVAdapterConfig._target_, LoraDenseAttentionAdapterConfig._target_, InfusedAdapterConfig._target_] + ) self.linear_qkv.return_layernorm_output = True # need layernorm output for lora mlp def get_query_key_value_tensors(self, hidden_states, key_value_states=None): @@ -106,19 +113,25 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): mixed_qkv = mixed_qkv.view(*new_tensor_shape) # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] - (query, key, value) = torch.split( - mixed_qkv, - [ - ( - self.num_attention_heads_per_partition - // self.num_query_groups_per_partition - * self.hidden_size_per_attention_head - ), - self.hidden_size_per_attention_head, - self.hidden_size_per_attention_head, - ], - dim=3, - ) + split_arg_list = [ + ( + self.num_attention_heads_per_partition + // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ] + + if SplitAlongDim is not None: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list,) + else: + + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3,) + # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) @@ -136,33 +149,143 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None): return query, key, value + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and 
Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states) + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb(query, q_pos_emb, config=self.config, cu_seqlens=cu_seqlens_q) + key = apply_rotary_pos_emb(key, k_pos_emb, config=self.config, cu_seqlens=cu_seqlens_kv) + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. + # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + if self.checkpoint_core_attention: + core_attn_out = self._checkpointed_attention_forward( + query, key, value, attention_mask, attn_mask_type=attn_mask_type, packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, key, value, attention_mask, attn_mask_type=attn_mask_type, packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + + output, bias = self.linear_proj(core_attn_out) + # LoRA logic + if self.is_adapter_available(): + lora_linear_proj_adapter = self.get_adapter_module(AdapterName.LORA_DENSE_ATTENTION_ADAPTER) + if lora_linear_proj_adapter: + lora_output = lora_linear_proj_adapter(core_attn_out) + output = output + lora_output + + return output, bias + class MCoreMLPMixin(MLP, MCoreAdapterModuleMixin): def mcore_register_adapters(self): """ Setup NeMo IA3 adapter to this MCore layer. 
""" - self.set_accepted_adapter_types([MLPInfusedAdapterConfig._target_]) # only self attn (packed qkv) for now + self.set_accepted_adapter_types( + [LoraHto4HAdapterConfig._target_, Lora4HtoHAdapterConfig._target_, MLPInfusedAdapterConfig._target_] + ) # only self attn (packed qkv) for now def forward(self, hidden_states): # [s, b, 4 * h/p] intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) - if self.config.bias_gelu_fusion: - assert self.config.add_bias_linear is True - assert self.activation_func == F.gelu - intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + # LoRA logic + if self.is_adapter_available(): + lora_linear_fc1_adapter = self.get_adapter_module(AdapterName.LORA_Hto4H_ADAPTER) + if lora_linear_fc1_adapter: + lora_output = lora_linear_fc1_adapter(hidden_states) + intermediate_parallel = intermediate_parallel + lora_output + + if self.config.bias_activation_fusion: + if self.activation_func == F.gelu: + assert self.config.add_bias_linear is True + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + elif self.activation_func == F.silu and self.config.gated_linear_unit: + intermediate_parallel = bias_swiglu_impl(intermediate_parallel, bias_parallel) + else: + raise ValueError("Only support fusion of gelu and swiglu") else: if bias_parallel is not None: intermediate_parallel = intermediate_parallel + bias_parallel - intermediate_parallel = self.activation_func(intermediate_parallel) + if self.config.gated_linear_unit: - infused_adapter = self.get_adapter_module(AdapterName.MLP_INFUSED) - if infused_adapter: - intermediate_parallel = infused_adapter(intermediate_parallel) + def glu(x): + x = torch.chunk(x, 2, dim=-1) + return self.config.activation_func(x[0]) * x[1] + + intermediate_parallel = glu(intermediate_parallel) + else: + intermediate_parallel = self.activation_func(intermediate_parallel) # [s, b, h] output, output_bias = self.linear_fc2(intermediate_parallel) + + # LoRA logic + if self.is_adapter_available(): + lora_linear_fc2_adapter = self.get_adapter_module(AdapterName.LORA_4HtoH_ADAPTER) + if lora_linear_fc2_adapter: + lora_output = lora_linear_fc2_adapter(intermediate_parallel) + output = output + lora_output return output, output_bias @@ -204,6 +327,7 @@ def forward( context_mask=None, rotary_pos_emb=None, inference_params=None, + packed_seq_params=None, ): # hidden_states: [s, b, h] @@ -219,6 +343,7 @@ def forward( attention_mask=attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, + packed_seq_param=packed_seq_params, ) # adapter logic diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index d97f73fb1dde..d57d40b5c581 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -67,6 +67,10 @@ class AdapterName(str, enum.Enum): LORA_KQV_ADAPTER = "lora_kqv_adapter" LORA_KV_ADAPTER = "lora_kv_adapter" LORA_Q_ADAPTER = "lora_q_adapter" + MM_LINEAR_ADAPTER = "mm_linear_adapter" + LORA_DENSE_ATTENTION_ADAPTER = "lora_dense_attention_adapter" + LORA_Hto4H_ADAPTER = "lora_hto4h_adapter" + LORA_4HtoH_ADAPTER = "lora_4htoh_adapter" MULTIMODAL_PROJECTOR_ADAPTER = "mm_projector_adapter" PARALLEL_LINEAR_ADAPTER = "parallel_linear_adapter" @@ -128,6 +132,7 @@ def __init__( column_init_method: str = 'xavier', # TODO: (@adithyare) should rename this to 
input_init_method to be more precise.         row_init_method: str = 'zero',  # TODO: (@adithyare) should rename this to output_init_method to be more precise.         gather_output: bool = True, +        input_is_parallel: bool = False,  # NOTE: (@ertkonuk) we need this for LoRA adapters that are applied to RowParallelLinear layers         dropout: float = 0.0,         model_parallel_config: Optional[ModelParallelConfig] = None,         **kwargs,     ): @@ -148,14 +153,25 @@ def __init__(         if model_parallel_config is None:             model_parallel_config = ModelParallelConfig()  -        self.linear_in = ColumnParallelLinear( -            in_features, -            dim, -            config=model_parallel_config, -            bias=False, -            gather_output=True, -            init_method=self._get_init_fn(column_init_method), -        ) +        if input_is_parallel: +            self.linear_in = RowParallelLinear( +                in_features, +                dim, +                config=model_parallel_config, +                input_is_parallel=True, +                skip_bias_add=True, +                bias=False, +                init_method=self._get_init_fn(column_init_method), +            ) +        else: +            self.linear_in = ColumnParallelLinear( +                in_features, +                dim, +                config=model_parallel_config, +                bias=False, +                gather_output=True, +                init_method=self._get_init_fn(column_init_method), +            )         if gather_output:             self.linear_out = RowParallelLinear(                 dim, @@ -174,7 +190,7 @@ def __init__(                 out_features,                 config=model_parallel_config,                 bias=False, -                gather_output=False, +                gather_output=True if input_is_parallel else False,                 init_method=self._get_init_fn(row_init_method),             )  @@ -249,6 +265,7 @@ class ParallelLinearAdapterConfig(AdapterConfig):     column_init_method: str = 'xavier'     row_init_method: str = 'zero'     gather_output: bool = True +    input_is_parallel: bool = False     dropout: float = 0.0     network_alpha: int | None = None     _target_: str = "{0}.{1}".format(ParallelLinearAdapter.__module__, ParallelLinearAdapter.__name__) @@ -281,6 +298,33 @@ class LoraQAdapter(ParallelLinearAdapter):     pass  +class LoraDenseAttentionAdapter(ParallelLinearAdapter): +    """ +    Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes +    and they do not use a bottleneck activation function +    """ + +    pass + + +class LoraHto4HAdapter(ParallelLinearAdapter): +    """ +    Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes +    and they do not use a bottleneck activation function +    """ + +    pass + + +class Lora4HtoHAdapter(ParallelLinearAdapter): +    """ +    Lora Adapters are the same arch as regular adapters but with potentially different input and output feature sizes +    and they do not use a bottleneck activation function +    """ + +    pass + + @dataclass class LoraKQVAdapterConfig(ParallelLinearAdapterConfig):     _target_: str = "{0}.{1}".format(LoraKQVAdapter.__module__, LoraKQVAdapter.__name__) @@ -296,6 +340,23 @@ class LoraKVAdapterConfig(ParallelLinearAdapterConfig):     _target_: str = "{0}.{1}".format(LoraKVAdapter.__module__, LoraKVAdapter.__name__)  +@dataclass +class LoraDenseAttentionAdapterConfig(ParallelLinearAdapterConfig): +    _target_: str = "{0}.{1}".format(LoraDenseAttentionAdapter.__module__, LoraDenseAttentionAdapter.__name__) +    input_is_parallel: bool = True + + +@dataclass +class LoraHto4HAdapterConfig(ParallelLinearAdapterConfig): +    _target_: str = "{0}.{1}".format(LoraHto4HAdapter.__module__, LoraHto4HAdapter.__name__) + + +@dataclass +class Lora4HtoHAdapterConfig(ParallelLinearAdapterConfig): +    _target_: str = "{0}.{1}".format(Lora4HtoHAdapter.__module__, Lora4HtoHAdapter.__name__) +    input_is_parallel: bool = True + + class PromptEncoderAdapter(nn.Module, AdapterModuleUtil):     """     The
Tensor Parallel MLP prompt encoder network that is used to generate the virtual diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py index 38ee587e5ca5..64e62fb81937 100644 --- a/nemo/collections/nlp/modules/common/megatron/attention.py +++ b/nemo/collections/nlp/modules/common/megatron/attention.py @@ -21,6 +21,7 @@ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, InfusedAdapterConfig, + LoraDenseAttentionAdapterConfig, LoraKQVAdapterConfig, LoraKQVAdapterWeightTyingConfig, LoraKVAdapterConfig, @@ -172,6 +173,7 @@ def __init__( LoraQAdapterConfig._target_, LoraKVAdapterConfig._target_, LoraKQVAdapterWeightTyingConfig._target_, + LoraDenseAttentionAdapterConfig._target_, ] ) @@ -570,6 +572,11 @@ def forward( # ================= output, bias = self.dense(context_layer) + if self.is_adapter_available(): + lora_dense_adapter = self.get_adapter_module(AdapterName.LORA_DENSE_ATTENTION_ADAPTER) + if lora_dense_adapter: + lora_dense_output = lora_dense_adapter(context_layer) + output = output + lora_dense_output if get_key_value: output = [output, present] diff --git a/nemo/collections/nlp/modules/common/megatron/mlp.py b/nemo/collections/nlp/modules/common/megatron/mlp.py index fd7bb5a7a702..aae86c54c1c4 100644 --- a/nemo/collections/nlp/modules/common/megatron/mlp.py +++ b/nemo/collections/nlp/modules/common/megatron/mlp.py @@ -17,6 +17,8 @@ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, + Lora4HtoHAdapterConfig, + LoraHto4HAdapterConfig, MLPInfusedAdapterConfig, ) from nemo.collections.nlp.modules.common.megatron.fused_bias_geglu import fused_bias_geglu @@ -93,7 +95,9 @@ def __init__( self.activation = activation self.dropout = dropout self.dtype = dtype - self.set_accepted_adapter_types([MLPInfusedAdapterConfig._target_]) + self.set_accepted_adapter_types( + [LoraHto4HAdapterConfig._target_, Lora4HtoHAdapterConfig._target_, MLPInfusedAdapterConfig._target_] + ) supported_activations = [ 'gelu', @@ -216,6 +220,11 @@ def forward(self, hidden_states): # [s, b, 4hp] intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) + if self.is_adapter_available(): + lora_dense_h_to_4h_adapter = self.get_adapter_module(AdapterName.LORA_Hto4H_ADAPTER) + if lora_dense_h_to_4h_adapter: + lora_intermediate_parallel = lora_dense_h_to_4h_adapter(hidden_states) + intermediate_parallel = intermediate_parallel + lora_intermediate_parallel if self.fast_glu_activation: intermediate_parallel, intermediate_parallel_2 = torch.chunk(intermediate_parallel, 2, dim=-1) @@ -259,6 +268,11 @@ def forward(self, hidden_states): # [s, b, h] output, output_bias = self.dense_4h_to_h(intermediate_parallel) + if self.is_adapter_available(): + lora_dense_4h_to_h_adapter = self.get_adapter_module(AdapterName.LORA_4HtoH_ADAPTER) + if lora_dense_4h_to_h_adapter: + lora_output = lora_dense_4h_to_h_adapter(intermediate_parallel) + output = output + lora_output return output, output_bias diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 72bcdf55e8ae..1d365723ebda 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -29,6 +29,9 @@ from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( AdapterName, InfusedAdapterConfig, + Lora4HtoHAdapterConfig, + LoraDenseAttentionAdapterConfig, + LoraHto4HAdapterConfig, 
LoraKQVAdapterConfig, LoraKQVAdapterWeightTyingConfig, MLPInfusedAdapterConfig, @@ -37,6 +40,47 @@ PromptEncoderAdapterConfig, ) +PEFT_MODULE_MAP = { + "qkv_module": "attention_qkv", + "dense_module": "attention_dense", + "hto4h_module": "mlp_fc1", + "4htoh_module": "mlp_fc2", + "attention": "attention", + "mlp": "mlp", + "all": "all", +} + + +def get_target_modules(lora_cfg): + original_target_modules = lora_cfg.get("target_modules", ["attention_qkv"]) + target_modules = [] + + for module in original_target_modules: + if module == PEFT_MODULE_MAP["attention"]: + if PEFT_MODULE_MAP['qkv_module'] not in target_modules: + target_modules.append(PEFT_MODULE_MAP['qkv_module']) + if PEFT_MODULE_MAP['dense_module'] not in target_modules: + target_modules.append(PEFT_MODULE_MAP['dense_module']) + elif module == PEFT_MODULE_MAP["mlp"]: + if PEFT_MODULE_MAP['hto4h_module'] not in target_modules: + target_modules.append(PEFT_MODULE_MAP['hto4h_module']) + if PEFT_MODULE_MAP['4htoh_module'] not in target_modules: + target_modules.append(PEFT_MODULE_MAP['4htoh_module']) + elif module == PEFT_MODULE_MAP["all"]: + for sub_module in [ + PEFT_MODULE_MAP['qkv_module'], + PEFT_MODULE_MAP['dense_module'], + PEFT_MODULE_MAP['hto4h_module'], + PEFT_MODULE_MAP['4htoh_module'], + ]: + if sub_module not in target_modules: + target_modules.append(sub_module) + else: + if module not in target_modules: + target_modules.append(module) + + return target_modules + class PEFTConfig: # superclass for adapter name and config @@ -62,6 +106,53 @@ def __init__(self, cfg): class LoraPEFTConfig(PEFTConfig): def __init__(self, cfg): lora_cfg = cfg.peft.lora_tuning + kv_channels = self._calculate_kv_channels(cfg) + projection_size = kv_channels * cfg.num_attention_heads + num_query_groups = cfg.get("num_query_groups", cfg.num_attention_heads) + + qkv_projection_size = projection_size + (2 * kv_channels * num_query_groups) + + fast_glu_activation = cfg.get('activation', 'gelu') in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] + + target_modules = get_target_modules(lora_cfg) + name_key_to_cfg = {} + name_key_to_mcore_mixins = {} + + for module in target_modules: + if module == PEFT_MODULE_MAP["qkv_module"]: + adapter_cfg = self._create_lora_config( + cfg, lora_cfg, cfg.hidden_size, qkv_projection_size, LoraKQVAdapterConfig + ) + name_key_to_cfg[AdapterName.LORA_KQV_ADAPTER] = adapter_cfg + name_key_to_mcore_mixins[AdapterName.LORA_KQV_ADAPTER] = [("self_attention", MCoreSelfAttentionMixin)] + + elif module == PEFT_MODULE_MAP["dense_module"]: + adapter_cfg = self._create_lora_config( + cfg, lora_cfg, cfg.hidden_size, cfg.hidden_size, LoraDenseAttentionAdapterConfig + ) + name_key_to_cfg[AdapterName.LORA_DENSE_ATTENTION_ADAPTER] = adapter_cfg + name_key_to_mcore_mixins[AdapterName.LORA_DENSE_ATTENTION_ADAPTER] = [ + ("self_attention", MCoreSelfAttentionMixin) + ] + + elif module == PEFT_MODULE_MAP["hto4h_module"]: + hto4h_projection_size = cfg.ffn_hidden_size * 2 if fast_glu_activation else cfg.ffn_hidden_size + adapter_cfg = self._create_lora_config( + cfg, lora_cfg, cfg.hidden_size, hto4h_projection_size, LoraHto4HAdapterConfig + ) + name_key_to_cfg[AdapterName.LORA_Hto4H_ADAPTER] = adapter_cfg + name_key_to_mcore_mixins[AdapterName.LORA_Hto4H_ADAPTER] = [("mlp", MCoreMLPMixin)] + elif module == PEFT_MODULE_MAP["4htoh_module"]: + adapter_cfg = self._create_lora_config( + cfg, lora_cfg, cfg.ffn_hidden_size, cfg.hidden_size, Lora4HtoHAdapterConfig + ) + name_key_to_cfg[AdapterName.LORA_4HtoH_ADAPTER] = adapter_cfg + 
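To make the module-name expansion performed by `get_target_modules` above concrete, here is a small usage sketch. The plain dicts stand in for the `model.peft.lora_tuning` config section, the import assumes this patch is applied to NeMo so the function exists at that path, and the printed lists are what the mapping above should produce:

```python
from nemo.collections.nlp.parts.peft_config import get_target_modules

# "attention" fans out to the qkv and dense projections; explicit names pass through unchanged
print(get_target_modules({"target_modules": ["attention", "mlp_fc2"]}))
# expected: ['attention_qkv', 'attention_dense', 'mlp_fc2']

# "all" expands to every supported projection, with duplicates dropped
print(get_target_modules({"target_modules": ["all"]}))
# expected: ['attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2']

# when target_modules is absent, only the qkv projection is adapted
print(get_target_modules({}))
# expected: ['attention_qkv']
```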
name_key_to_mcore_mixins[AdapterName.LORA_4HtoH_ADAPTER] = [("mlp", MCoreMLPMixin)] + + self.name_key_to_mcore_mixins = name_key_to_mcore_mixins + super().__init__(lora_cfg, name_key_to_cfg) + + def _calculate_kv_channels(self, cfg): if cfg.get("kv_channels", None) is None: assert ( cfg.hidden_size % cfg.num_attention_heads == 0 @@ -69,15 +160,12 @@ def __init__(self, cfg): kv_channels = cfg.hidden_size // cfg.num_attention_heads else: kv_channels = cfg.kv_channels - projection_size = kv_channels * cfg.num_attention_heads - num_query_groups = cfg.get("num_query_groups", None) - if num_query_groups is None: - num_query_groups = cfg.num_attention_heads - qkv_projection_size = projection_size + (2 * kv_channels * num_query_groups) + return kv_channels + def _create_lora_config(self, cfg, lora_cfg, in_features, out_features, adapter_cfg_cls): config_args = { - "in_features": cfg.hidden_size, - "out_features": qkv_projection_size, + "in_features": in_features, + "out_features": out_features, "dim": lora_cfg.adapter_dim, "norm_position": None, "norm_type": None, @@ -95,7 +183,7 @@ def __init__(self, cfg): elif position_embedding_strategy == "add": dim_position_embeddings = cfg.hidden_size elif position_embedding_strategy == "biasadd": - dim_position_embeddings = 3 * projection_size + dim_position_embeddings = 3 * out_features elif position_embedding_strategy == "concat": dim_position_embeddings = lora_cfg.adapter_dim elif position_embedding_strategy == "mlpconcat": @@ -111,16 +199,10 @@ def __init__(self, cfg): "position_embedding_strategy": position_embedding_strategy, } ) - adapter_cfg = LoraKQVAdapterWeightTyingConfig(**config_args) - else: - adapter_cfg = LoraKQVAdapterConfig(**config_args) - name_key_to_cfg = { - AdapterName.LORA_KQV_ADAPTER: adapter_cfg, - } - self.name_key_to_mcore_mixins = {AdapterName.LORA_KQV_ADAPTER: [("self_attention", MCoreSelfAttentionMixin)]} + adapter_cfg = adapter_cfg_cls(**config_args) - super().__init__(lora_cfg, name_key_to_cfg) + return adapter_cfg class IA3PEFTConfig(PEFTConfig): diff --git a/scripts/nlp_language_modeling/convert_starcoder_hf_to_nemo.py b/scripts/nlp_language_modeling/convert_starcoder_hf_to_nemo.py index 6cb0fa4c8b9f..f1e3d4e6ee1e 100644 --- a/scripts/nlp_language_modeling/convert_starcoder_hf_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_starcoder_hf_to_nemo.py @@ -137,6 +137,7 @@ def get_new_key(old_key): "encoder_seq_length": hf_config.n_positions, "max_position_embeddings": hf_config.n_positions, "num_layers": hf_config.n_layer, + "cpu_offloading_num_layers": hf_config.n_layer - 1, # @chcui temp workaround before m-lm !1124 is merged "num_attention_heads": hf_config.n_head, "ffn_hidden_size": hf_config.n_inner, "layernorm_epsilon": hf_config.layer_norm_epsilon, From 223456a44802e5cf8dd783c0249fc517438082d2 Mon Sep 17 00:00:00 2001 From: Huiying Date: Wed, 21 Feb 2024 14:56:21 -0800 Subject: [PATCH 25/28] Add Neva Template for NV-DPO Models (#8358) * add/rename from nvgpt to nv_steerlm, add nv_dpo template Signed-off-by: HuiyingLi * add nv_dpo conversation to accomendate empty system message Signed-off-by: HuiyingLi * handle nv_dpo template text generation Signed-off-by: HuiyingLi * add prompt string to nvgpt Signed-off-by: HuiyingLi * bugfix for inference prompt template Signed-off-by: HuiyingLi * bug fix for grabbing clean text Signed-off-by: Huiying Li * fix code format Signed-off-by: Huiying Li --------- Signed-off-by: HuiyingLi Signed-off-by: Huiying Li Signed-off-by: Michal Futrega --- 
.../multimodal/data/neva/conversation.py | 13 +++ .../multimodal/data/neva/neva_dataset.py | 105 +++++++++++++++++- .../common/text_generation_strategy.py | 12 +- .../modules/common/text_generation_utils.py | 4 +- 4 files changed, 129 insertions(+), 5 deletions(-) diff --git a/nemo/collections/multimodal/data/neva/conversation.py b/nemo/collections/multimodal/data/neva/conversation.py index 886049dd5170..d51a5f973f99 100644 --- a/nemo/collections/multimodal/data/neva/conversation.py +++ b/nemo/collections/multimodal/data/neva/conversation.py @@ -263,6 +263,17 @@ def dict(self): sep2=f"{DEFAULT_SYSTEM_TOKEN}System\n", ) +conv_nv_dpo = Conversation( + system="\n", + roles=("User", "Assistant"), + version="nv_dpo", + messages=(), + offset=0, + sep_style=SeparatorStyle.NVGPT, + sep=DEFAULT_SEPARATOR_TOKEN, + sep2=f"{DEFAULT_SYSTEM_TOKEN}System\n", +) + conv_vicuna_v0 = Conversation( system="A chat between a curious human and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the human's questions.", @@ -400,6 +411,8 @@ def dict(self): "v1_mmtag": conv_llava_v1_mmtag, "llava_llama_2": conv_llava_llama_2, "nvgpt": conv_nvgpt, + "nv_steerlm": conv_nvgpt, + "nv_dpo": conv_nv_dpo, } diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 90f862869369..15d755a7d59a 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -381,6 +381,8 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: - The function asserts that each message in a conversation alternates between the defined roles and skips messages not starting with the 'human' role. """ + """System\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUser\n{user input}\nAssistant\nquality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4\n""" + conv = conversation_lib.conv_nvgpt.copy() # Apply prompt templates @@ -462,6 +464,105 @@ def preprocess_nvgpt(sources: dict, tokenizer, cfg,) -> Dict: return dict(tokens=tokens, labels=labels,) +def preprocess_nv_dpo(sources: dict, tokenizer, cfg,) -> Dict: + """ + Preprocess a given set of conversational sources using nvgpt conversation template + + This function processes conversations by first ensuring the conversation starts with a 'human' role, then tokenizes the conversations, applies specific token replacements, and finally masks labels for training purposes. + + Parameters: + - sources: A dictionary containing conversational data. Expected format is a dict of conversations, where each conversation is a list of messages, and each message is a dict with 'from' (role) and 'value' (message text). + - tokenizer: A tokenizer from the Hugging Face Transformers library used for tokenizing the conversations. + - cfg: Configuration settings which include 'add_extra_token' (bool) to determine if an extra token should be added to the tokenized output, and 'context_length' for specifying the tokenization context length. + + Returns: + - Dict: A dictionary containing two keys: + - 'tokens': A tensor of tokenized conversation data. + - 'labels': A tensor of labels for the conversation data, used for training models. Labels are masked based on the conversation structure. 
+ + Note: + - The function includes specific token replacements (e.g., DEFAULT_IMAGE_PATCH_TOKEN, , ) and masking techniques for labels. + - It is designed to work with conversational data where messages alternate between a 'human' and a 'gpt' role. + - The function asserts that each message in a conversation alternates between the defined roles and skips messages not starting with the 'human' role. + """ + + """System\n\nUser\n{user input}\nAssistant\n""" + + conv = conversation_lib.conv_nv_dpo.copy() + + # Apply prompt templates + conversations = [] + for source in sources: + conv.messages = [] + conv.system = source.get('system', conv.system) + + strip_end_for_inference = False + for i, turn in enumerate(source['conversations']): + + if i % 2 == 1: + turn['from'] = conv.roles[1] + conv.append_message(turn['from'], turn['value']) + if not turn["value"]: + strip_end_for_inference = ( + True # in inference, current turn is empty, thus end tokens need to striped. + ) + else: + turn['from'] = conv.roles[0] + conv.append_message(turn['from'], turn['value']) + context = conv.get_prompt() + if strip_end_for_inference: + if context.endswith("\n"): + context = context[: -len("\n")] + "\n" + conversations.append(context) + + add_extra_token = cfg.get("add_extra_token") + # Tokenize conversations + tokens = tokenize( + texts=conversations, + tokenizer=tokenizer, + context_length=cfg.get("context_length"), + add_extra_token=add_extra_token, + ) + + labels = tokens.clone().detach() + + # Mask targets + sep = conv.sep + conv.roles[1] + "\n" + for conversation, target in zip(conversations, labels): + rounds = conversation.split(conv.sep) + re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt + + for conv_idx in range(3, len(rounds), 2): + re_rounds.append(conv.sep.join(rounds[conv_idx : conv_idx + 2])) # user + gpt + + cur_len = 0 + for i, rou in enumerate(re_rounds): + if rou == "": + break + parts = rou.split(sep) + if len(parts) != 2: + break + + instruction_len = len(tokenizer.text_to_ids(parts[0] + sep)) + round_len = len(tokenizer.text_to_ids(rou + conv.sep)) + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + # Check if masking working correctly + # print([x for x in zip(tokens[0].numpy().tolist(), labels[0].numpy().tolist())]) + + if add_extra_token: + tokens = tokens[:, :-1].contiguous() + labels = labels[:, 1:].contiguous() + else: + labels = torch.roll(labels, shifts=-1, dims=-1) + labels[:, -1] = IGNORE_INDEX + + return dict(tokens=tokens, labels=labels,) + + def preprocess_plain(sources, tokenizer, cfg,) -> Dict: """ Preprocesses plain text sources (no template) for tokenization and label generation. 
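The tail of `preprocess_nv_dpo` (shared with its sibling preprocessors) performs the usual next-token label alignment on top of the role-based masking above: labels are the input tokens shifted left by one, with the final position ignored. A toy sketch of just that shifting step, assuming `IGNORE_INDEX = -1` as used elsewhere in the Neva dataset code; the conversation-structure masking is not reproduced here:

```python
import torch

IGNORE_INDEX = -1  # assumed value, mirroring the Neva dataset constant

def shift_labels(tokens: torch.Tensor, add_extra_token: bool):
    """Toy version of the end of preprocess_nv_dpo: align labels for next-token prediction."""
    labels = tokens.clone().detach()
    if add_extra_token:
        # one extra token was tokenized, so both tensors can simply be cropped
        tokens = tokens[:, :-1].contiguous()
        labels = labels[:, 1:].contiguous()
    else:
        # no extra token: rotate labels left and ignore the wrapped-around last position
        labels = torch.roll(labels, shifts=-1, dims=-1)
        labels[:, -1] = IGNORE_INDEX
    return tokens, labels

toks = torch.tensor([[101, 7, 8, 9, 102]])
print(shift_labels(toks, add_extra_token=False))
# tokens unchanged; labels -> [[7, 8, 9, 102, -1]]
```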
@@ -604,8 +705,10 @@ def expand2square(pil_img, background_color): images_tensors = torch.tensor([]) sources = copy.deepcopy(sources) - if self.conv_template == "nvgpt": + if self.conv_template in ["nvgpt", "nv_steerlm"]: data_dict = preprocess_nvgpt(sources, self.tokenizer, self.multimodal_cfg,) + elif self.conv_template == "nv_dpo": + data_dict = preprocess_nv_dpo(sources, self.tokenizer, self.multimodal_cfg,) elif self.conv_template == "v1": data_dict = preprocess_v1(sources, self.tokenizer, self.multimodal_cfg,) elif self.conv_template == "llama_2": diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index fd68eef592fd..59452ce96f99 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -329,14 +329,17 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c DEFAULT_IMAGE_TOKEN, preprocess_llama_2, preprocess_multimodal, + preprocess_nv_dpo, preprocess_nvgpt, preprocess_v1, ) list_data_dict = [] - if multimodal_cfg["conv_template"] == "nvgpt": + if multimodal_cfg["conv_template"] in ["nvgpt", "nv_steerlm", "nv_dpo"]: record = { - 'system': 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions.\n\n', + 'system': '\n' + if multimodal_cfg["conv_template"] == 'nv_dpo' + else 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions.\n\n', 'conversations': [{'from': 'User', 'value': prompt}, {'from': 'Assistant', 'value': '',},], } @@ -348,7 +351,10 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c sources = preprocess_multimodal( copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW - data_dict = preprocess_nvgpt(sources, tokenizer, multimodal_cfg) + if multimodal_cfg["conv_template"] in ["nvgpt", "nv_steerlm"]: + data_dict = preprocess_nvgpt(sources, tokenizer, multimodal_cfg) + else: + data_dict = preprocess_nv_dpo(sources, tokenizer, multimodal_cfg) elif multimodal_cfg["conv_template"] == "llama_2": record = { diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index c6a8f1e46900..7946b846c7cd 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -181,7 +181,7 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para clean_response = clean_text - if conv_template == "nvgpt": + if conv_template in ["nvgpt", "nv_steerlm"]: labels_str_regexp = re.compile(f"quality:.*\n") last_match_end_position = None for match in re.finditer(labels_str_regexp, clean_response): @@ -189,6 +189,8 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para if last_match_end_position is not None: clean_response = clean_response[last_match_end_position:] clean_response = clean_response.strip("") + elif conv_template == 'nv_dpo': + clean_response = clean_response.split("")[-2][10:] # [10:] for removing "Assistant\n" elif conv_template == "llama_2": clean_response = clean_response.rsplit("[/INST] ", 1)[-1] elif conv_template == "v1": From 25e02f48eb3b9defa28f69ddaea65e69cb04d1f9 Mon Sep 17 00:00:00 2001 From: Michal 
Futrega Date: Thu, 22 Feb 2024 11:23:54 +0100 Subject: [PATCH 26/28] Rebase scaling alpha Signed-off-by: Michal Futrega --- .../modules/common/megatron/adapters/parallel_adapters.py | 5 +++++ nemo/collections/nlp/parts/peft_config.py | 1 + 2 files changed, 6 insertions(+) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index d57d40b5c581..e1387c7d61b0 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -135,6 +135,7 @@ def __init__( input_is_parallel: bool = False, # NOTE: (@ertkonuk) we need this for LoRA adapters that are applied to RowParallelLinear layers dropout: float = 0.0, model_parallel_config: Optional[ModelParallelConfig] = None, + alpha: float | None = None, **kwargs, ): super().__init__() @@ -147,6 +148,7 @@ def __init__( self.activation = activation_registry[activation]() self.norm_position = norm_position self.dim = dim + self.alpha = alpha if alpha is not None else self.dim # megatron_gpt_peft_models will provide this arg, but deprecated ones do not. # in case this arg is not provided, use the dummy default config. @@ -250,6 +252,8 @@ def forward(self, x): # Add dropout if available if self.dropout is not None: x = self.dropout(x) + + x = x * (self.alpha / self.dim) return x @@ -267,6 +271,7 @@ class ParallelLinearAdapterConfig(AdapterConfig): gather_output: bool = True input_is_parallel: bool = False dropout: float = 0.0 + alpha: float | None = None network_alpha: int | None = None _target_: str = "{0}.{1}".format(ParallelLinearAdapter.__module__, ParallelLinearAdapter.__name__) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 1d365723ebda..a1c392fb7e82 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -174,6 +174,7 @@ def _create_lora_config(self, cfg, lora_cfg, in_features, out_features, adapter_ "row_init_method": lora_cfg.get("row_init_method", "zero"), "gather_output": False, "dropout": lora_cfg.adapter_dropout, + "alpha": lora_cfg.get("alpha", lora_cfg.adapter_dim), } if lora_cfg.weight_tying: From 6d3ad7b679b274727959c20d5c8ea5ab3d794724 Mon Sep 17 00:00:00 2001 From: arendu Date: Thu, 25 Jan 2024 22:26:21 +0000 Subject: [PATCH 27/28] default for alpha Signed-off-by: arendu Signed-off-by: Michal Futrega --- .../tuning/conf/megatron_gpt_finetuning_config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml index af561ffe0aad..96752696da41 100644 --- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_finetuning_config.yaml @@ -96,6 +96,7 @@ model: lora_tuning: target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) adapter_dim: 32 + alpha: ${model.peft.lora_tuning.adapter_dim} adapter_dropout: 0.0 column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal From 1ec1892b74d10e1309a78a58d6747b7fe110eb51 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jan 2024 22:28:53 +0000 Subject: [PATCH 28/28] Rebase scaling alpha Signed-off-by: Michal Futrega --- .../nlp/modules/common/megatron/adapters/parallel_adapters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index e1387c7d61b0..4c9298ab53a2 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -252,7 +252,7 @@ def forward(self, x): # Add dropout if available if self.dropout is not None: x = self.dropout(x) - + x = x * (self.alpha / self.dim) return x
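For context on the scaling introduced in these last two patches: the adapter output is multiplied by `alpha / dim`, and `alpha` defaults to the adapter dimension (see the YAML default above), so existing configs keep a scale of 1.0 while a larger `alpha` strengthens the LoRA update without retuning the learning rate. A minimal sketch of the effect, independent of Megatron/NeMo, with arbitrary layer shapes:

```python
import torch

def lora_delta(x, linear_in, linear_out, dim, alpha=None):
    """Low-rank update with the alpha/dim scaling added in the patch above (sketch only)."""
    alpha = alpha if alpha is not None else dim  # default keeps behaviour unchanged: scale = 1.0
    delta = linear_out(linear_in(x))
    return delta * (alpha / dim)

linear_in = torch.nn.Linear(1024, 32, bias=False)
linear_out = torch.nn.Linear(32, 1024, bias=False)
x = torch.randn(2, 1024)

base = lora_delta(x, linear_in, linear_out, dim=32)              # alpha == dim -> scale 1.0
boosted = lora_delta(x, linear_in, linear_out, dim=32, alpha=64)  # alpha = 2 * dim
print(torch.allclose(boosted, 2.0 * base))  # True: doubling alpha doubles the update
```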