From 27de8458bbfe77258235d077eb55cb68e7701d59 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Tue, 11 Jun 2024 01:02:26 +0300 Subject: [PATCH 01/25] cherry pick of #9266 (#9411) * add deprecation warnings for non-mcore models Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change warning default time Signed-off-by: dimapihtar * remove unused import Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * remove deprecated tests Signed-off-by: dimapihtar * set mcore_gpt to True Signed-off-by: dimapihtar * set mcore_bert to True Signed-off-by: dimapihtar * remove deprecated tests Signed-off-by: dimapihtar * remove deprecated unit tests Signed-off-by: dimapihtar * add deprecation warning Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * remove deprecated playbook Signed-off-by: dimapihtar * remove deprecated tutorial Signed-off-by: dimapihtar * turn off FA for Bert Signed-off-by: dimapihtar * turn of FA for Bert Signed-off-by: dimapihtar * change mcore commit Signed-off-by: dimapihtar * adjustments * update TE commit Signed-off-by: dimapihtar * fix mcore precision issue Signed-off-by: dimapihtar * change precision for bert Signed-off-by: dimapihtar * change precision for fine-tuning Signed-off-by: dimapihtar * turn off fused attention for bert Signed-off-by: dimapihtar * fix bert test Signed-off-by: dimapihtar * revert tests Signed-off-by: dimapihtar * fix typo Signed-off-by: dimapihtar * remove unnecessary Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: Pablo Garay --- .github/workflows/cicd-main.yml | 2065 ++++++----------- .../conf/megatron_bert_config.yaml | 8 +- .../conf/megatron_gpt_config.yaml | 6 +- .../assistant_data_processor.py | 19 +- .../dialogue/data_processor/data_processor.py | 8 +- .../data_processor/design_data_processor.py | 6 +- .../mellon_qa_data_processor.py | 15 +- .../data_processor/ms_marco_data_processor.py | 12 +- .../data_processor/sgd_data_processor.py | 34 +- .../dialogue/dataset/dialogue_bert_dataset.py | 15 +- .../dialogue_gpt_classification_dataset.py | 15 +- .../dialogue_gpt_generation_dataset.py | 15 +- .../dialogue_nearest_neighbour_dataset.py | 4 + .../dialogue_s2s_generation_dataset.py | 15 +- .../dialogue_zero_shot_intent_dataset.py | 21 +- .../megatron/base_prompt_learning_dataset.py | 20 +- .../megatron/gpt_prompt_learning_dataset.py | 32 +- .../dataset/qa_bert_dataset.py | 14 +- .../question_answering/dataset/qa_dataset.py | 32 +- .../dataset/qa_gpt_dataset.py | 21 +- .../dataset/qa_s2s_dataset.py | 35 +- .../question_answering_squad/qa_dataset.py | 24 +- .../bert_example.py | 104 +- .../dialogue_gpt_classification_model.py | 26 +- .../dialogue/dialogue_gpt_generation_model.py | 19 +- .../dialogue_nearest_neighbour_model.py | 11 +- .../dialogue/dialogue_s2s_generation_model.py | 14 +- .../dialogue_zero_shot_intent_model.py | 10 +- .../intent_slot_classification_model.py | 15 +- .../nlp/models/dialogue/sgdqa_model.py | 16 +- .../entity_linking/entity_linking_model.py | 6 +- .../glue_benchmark/glue_benchmark_model.py | 3 + .../megatron/bert/bert_model.py | 22 +- .../language_modeling/megatron/gpt_model.py | 16 +- .../megatron_base_prompt_learning_model.py | 4 + 
.../megatron_gpt_prompt_learning_model.py | 65 +- .../question_answering/qa_base_model.py | 11 +- .../question_answering/qa_bert_model.py | 32 +- .../models/question_answering/qa_gpt_model.py | 34 +- .../nlp/models/question_answering/qa_model.py | 6 +- .../models/question_answering/qa_s2s_model.py | 44 +- .../spellchecking_model.py | 11 +- nemo/utils/decorators/__init__.py | 2 +- nemo/utils/decorators/deprecated.py | 39 +- tests/collections/nlp/test_dialogue.py | 278 --- .../nlp/test_entity_linking_model.py | 84 - tests/collections/nlp/test_megatron.py | 81 - tests/collections/nlp/test_mem_map_dataset.py | 133 -- tests/collections/nlp/test_prompt_learning.py | 142 -- tests/collections/nlp/test_qna.py | 240 -- .../nlp/test_question_answering.py | 185 -- .../test_spellchecking_asr_customization.py | 1102 --------- tutorials/nlp/Dialogue.ipynb | 717 ------ tutorials/nlp/Entity_Linking_Medical.ipynb | 632 ----- tutorials/nlp/GLUE_Benchmark.ipynb | 566 ----- tutorials/nlp/MegatronBert_export.ipynb | 280 --- tutorials/nlp/Question_Answering.ipynb | 1163 ---------- ...pellMapper_English_ASR_Customization.ipynb | 1412 ----------- 58 files changed, 1252 insertions(+), 8709 deletions(-) delete mode 100644 tests/collections/nlp/test_dialogue.py delete mode 100644 tests/collections/nlp/test_entity_linking_model.py delete mode 100644 tests/collections/nlp/test_megatron.py delete mode 100644 tests/collections/nlp/test_mem_map_dataset.py delete mode 100644 tests/collections/nlp/test_prompt_learning.py delete mode 100644 tests/collections/nlp/test_qna.py delete mode 100644 tests/collections/nlp/test_question_answering.py delete mode 100644 tests/collections/nlp/test_spellchecking_asr_customization.py delete mode 100644 tutorials/nlp/Dialogue.ipynb delete mode 100644 tutorials/nlp/Entity_Linking_Medical.ipynb delete mode 100644 tutorials/nlp/GLUE_Benchmark.ipynb delete mode 100644 tutorials/nlp/MegatronBert_export.ipynb delete mode 100644 tutorials/nlp/Question_Answering.ipynb delete mode 100644 tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 12b8cdcb8eed..01a8cfc4b0df 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -871,318 +871,6 @@ jobs: pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ output_manifest=preds.json - # L2: Dialogue Classification - - # TODO: pleasefixme - # L2_Dialogue_Classification_Dialogue_Intent_and_slot_classification_using_GPT: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure-gpus-1 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # cd examples/nlp/dialogue && \ - # python dialogue.py \ - # model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\ - # model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - # model.dataset.dialogues_example_dir=sgd_gen_outputs \ - # model.dataset.task_name=debug_sample \ - # trainer.max_steps=1 \ - # trainer.max_epochs=1 \ - # model.train_ds.batch_size=2 \ - # model.validation_ds.batch_size=2 \ - # model.test_ds.batch_size=2 \ - # 
model.nemo_path=null \ - # trainer.val_check_interval=0.0 \ - # trainer.devices=1 \ - # model.dataset.use_cache=false \ - # model.tokenizer.special_tokens={pad_token:"endoftext"} \ - # model.tokenizer.tokenizer_name=gpt2 \ - # model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - # model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \ - # trainer.accelerator=gpu \ - # exp_manager=null && \ - # rm -rf sgd_gen_outputs - - L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ - model.dataset.task_name=debug_sample \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.dataset.num_tasks=6 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-cased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_outputs - - L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ - model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ - model.dataset.task=assistant \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_intent_classification_outputs - - L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ - model.dataset.task=zero_shot \ - model.dataset.prompt_template="This example is" \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_zero_shot_intent_classification_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd 
examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=megatron \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_bart_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_dialogue_nearest_neighbour_classification_outputs - - # L2: Dialogue Generation - L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender_s2s \ - model.dataset.task=ms_marco \ - model.library=huggingface \ - model.dataset.debug_mode=True \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=facebook/bart-large \ - 
trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender_s2s - - L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ - model.dataset.task_name=debug_sample \ - model.dataset.task=sgd_generation \ - model.dataset.input_field=utterance+system_actions \ - model.dataset.output_field=system_utterance \ - model.dataset.use_cache=false \ - model.dataset.system_utterance=next_turn \ - model.dataset.debug_mode=True \ - model.dataset.prompt_template=slots_values \ - model.library=huggingface \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null - AFTER_SCRIPT: | - rm -rf sgd_answer_extender_s2s - -# - name: L2: Dialogue Generation Part 2 -# when { -# anyOf { -# branch main -# changeRequest target: main -# } -# } -# failFast true -# parallel { -# - name: Dialogue: Answer Extender using DialogueGPTGenerationModel -# - run: | -# cd examples/nlp/dialogue && \ -# python dialogue.py \ -# do_training=False \ -# model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ -# model.dataset.dialogues_example_dir=answer_extender \ -# model.library=huggingface \ -# model.dataset.task=ms_marco \ -# model.dataset.debug_mode=True \ -# trainer.val_check_interval=0.0 \ -# trainer.devices=1 \ -# model.dataset.use_cache=false \ -# model.language_model.pretrained_model_name=gpt2 \ -# trainer.accelerator=gpu \ -# exp_manager=null && \ -# rm -rf answer_extender -# } -# } -# } -# } - - # L2: COPY - L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender \ - model.library=huggingface \ - model.dataset.task=ms_marco \ - model.dataset.debug_mode=True \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender - # L2: Duplex Text Normalization L2_Duplex_Text_Normalization_with_Tarred_dataset: needs: [cicd-test-container-setup] @@ -1212,216 +900,6 @@ jobs: data.test_ds.use_cache=false \ data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv -# Runs out of memory on the 12G TITAN V (GPU 0 on main CI) -# TODO: add when megatron bert is supported again in NeMo -# - name: L2: MegaBERT Token Classification -# when { -# anyOf { -# branch main -# changeRequest target: main -# } -# } -# failFast true -# - run: | -# cd examples/nlp/token_classification && \ -# python token_classification_train.py \ -# model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ -# model.language_model.pretrained_model_name=megatron-bert-345m-uncased \ -# 
model.train_ds.batch_size=10 \ -# model.dataset.max_seq_length=50 \ -# model.dataset.use_cache=false \ -# trainer.accelerator=gpu \ -# trainer.strategy=ddp \ -# trainer.precision=16 \ -# trainer.devices=1 \ -# trainer.accelerator="gpu" \ -# +trainer.fast_dev_run=true \ -# exp_manager=null -# } -# } - - # L2: BERT Text Classification - L2_BERT_Text_Classification_with_BERT_Test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/text_classification && \ - python text_classification_with_bert.py \ - model.dataset.num_classes=6 \ - model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.train_ds.batch_size=10 \ - model.dataset.max_seq_length=50 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null - - # L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_BERT_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_BERT_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - # L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_BART_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - 
model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_BART_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - # L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null # L2: Intent and 
Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: @@ -1653,241 +1131,7 @@ jobs: pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; rm -rf "${data_dir}" - - - L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/token_classification && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir}" \ - model.validation_ds.ds_item="${tmp_data_dir}" \ - model.test_ds.ds_item="${tmp_data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=true && \ - tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ - mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ - rm -rf "${tmp_data_dir}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir_2}" \ - model.validation_ds.ds_item="${tmp_data_dir_2}" \ - model.test_ds.ds_item="${tmp_data_dir_2}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null; - rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ - "${tmp_data_dir_2}" \ - "${output_dir}" - - # Punctuation & Capitalization tarred dataset: - Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ - /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ - "${data_dir}"/ && \ - usual_data=${data_dir}/wmt_wiki_10000 && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tarred_data=${output_dir}/train_tarred && \ - tokens_in_batch=2000 && \ - max_seq_length=512 && \ - lm_model=distilbert-base-uncased && \ - python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ - --text ${usual_data}/input.txt \ - --labels ${usual_data}/labels.txt \ - --output_dir ${tarred_data} \ - --tokens_in_batch ${tokens_in_batch} \ - --max_seq_length 512 \ - --lines_per_dataset_fragment 2000 \ - --num_batches_per_tarfile 5 \ - --tar_file_prefix punctuation_capitalization \ - --tokenizer_name ${lm_model} \ - --use_fast_tokenizer \ - --pad_label O \ - --n_jobs 3 && \ - echo "Number of tarred files in dataset:" && \ - ls ${tarred_data}/*.tar | wc -l && \ - echo "Label id files in dataset:" && \ - ls ${tarred_data}/*.csv && \ - 
metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.train_ds.ds_item=${tarred_data} \ - model.language_model.pretrained_model_name=${lm_model} \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.tar_metadata_file=${metadata_file} \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=${output_dir}/output; - - rm -rf "${output_dir}" "${data_dir}" - - # Punctuation_Capitalization_Different_ways_of_passing_labels_to_model - Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - label_vocab_dir="${work_dir}/labels" && \ - mkdir -p ${label_vocab_dir} && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ - capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ - printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ - printf "O\nU\n" > "${capit_label_vocab}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ - model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ - model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}" - - # TODO: pleasefixme - # Punctuation_Capitalization_Using_model-common_datasets_parameters-punct-capit-_label_ids: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout 
repository - # uses: actions/checkout@v4 - # - run: | - # cd examples/nlp/token_classification && \ - # work_dir="$(mktemp -d -p "$(pwd)")" && \ - # output_dir="${work_dir}/output" && \ - # mkdir -p "${output_dir}" && \ - # data_dir="${work_dir}/data" && \ - # mkdir -p "${data_dir}" && \ - # cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - # conf_name=punctuation_capitalization_config_with_ids && \ - # cp conf/punctuation_capitalization_config.yaml "${work_dir}/${conf_name}.yaml" && \ - # sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \ - # "${work_dir}/${conf_name}.yaml" && \ - # sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \ - # "${work_dir}/${conf_name}.yaml" && \ - # python punctuation_capitalization_train_evaluate.py \ - # --config-path "${work_dir}" \ - # --config-name "${conf_name}" \ - # model.train_ds.use_tarred_dataset=false \ - # model.train_ds.ds_item="${data_dir}" \ - # model.validation_ds.ds_item="${data_dir}" \ - # model.test_ds.ds_item="${data_dir}" \ - # model.language_model.pretrained_model_name=distilbert-base-uncased \ - # +model.train_ds.use_cache=false \ - # +model.validation_ds.use_cache=false \ - # +model.test_ds.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.strategy=ddp \ - # trainer.max_epochs=1 \ - # +exp_manager.explicit_log_dir="${output_dir}" \ - # +do_testing=false && \ - # python punctuation_capitalization_train_evaluate.py \ - # +do_training=false \ - # +do_testing=true \ - # ~model.train_ds \ - # ~model.validation_ds \ - # model.test_ds.ds_item="${data_dir}" \ - # pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - # +model.train_ds.use_cache=false \ - # +model.validation_ds.use_cache=false \ - # +model.test_ds.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.strategy=ddp \ - # trainer.max_epochs=1 \ - # exp_manager=null && \ - # rm -rf "${work_dir}" - - # Punctuation & Capitalization inference - Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - output_dir="$(mktemp -d -p "$(pwd)")" && \ - python examples/nlp/token_classification/punctuate_capitalize_infer.py \ - --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ - --output_text "${output_dir}/iwslt_inference_result.txt" \ - --max_seq_length 92 \ - --step 8 \ - --margin 16 \ - --pretrained_name punctuation_en_bert \ - --batch_size 32; - rm -rf "${output_dir}" # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: @@ -1947,23 +1191,6 @@ jobs: #rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed - # L2: Entity Linking - L2_Entity_Linking_Self_Alignment_Pretraining_BERT: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/entity_linking && \ - python self_alignment_pretraining.py \ - project_dir=. 
\ - trainer.val_check_interval=3 \ - model.raw_data=None \ - model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ - model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ - model.train_ds.batch_size=8 \ - model.validation_ds.batch_size=8 \ - exp_manager.exp_dir=null # TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 # is in the release container @@ -2581,211 +1808,250 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ 
github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings L2_Megatron_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - 
exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + 
model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python 
examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_RETRO_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] @@ -3086,168 +2352,189 @@ jobs: L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - 
trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=1 \ - # 
trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=6 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # exp_manager.resume_if_exists=True \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=2 \ - # model.optim.sched.constant_steps=2 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + 
trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + + L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=rope \ + model.rotary_percentage=0.5 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + # trainer.devices=2 \ + # trainer.accelerator=gpu \ + # trainer.log_every_n_steps=1 \ + # trainer.val_check_interval=2 \ + # trainer.limit_val_batches=1 \ + # trainer.accumulate_grad_batches=1 \ + # trainer.max_steps=6 \ + # trainer.gradient_clip_val=1.0 \ + # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + # exp_manager.resume_if_exists=True \ + # model.tensor_model_parallel_size=2 \ + # model.optim.name=fused_adam \ + # model.optim.lr=2e-4 \ + # model.optim.sched.warmup_steps=2 \ + # model.optim.sched.constant_steps=2 \ + # model.optim.sched.min_lr=8e-5 \ + # model.max_position_embeddings=128 \ + # model.encoder_seq_length=128 \ + # model.data.seq_length=128 \ + # model.position_embedding_type=rope \ + # model.rotary_percentage=0.5 \ + # model.normalization=rmsnorm \ + # model.bias=False \ + # model.bias_activation_fusion=False \ + # model.bias_dropout_add_fusion=False \ + # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + # model.num_layers=8 \ + # model.hidden_size=256 \ + # model.num_attention_heads=8 \ + # model.activations_checkpoint_method=block \ + # model.activations_checkpoint_granularity=full \ + # model.activations_checkpoint_num_layers=1 \ + # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" # This test requires Ampere but some of the test GPUs are Volta # Need to add a check for compute capability before uncommenting this test @@ -3343,169 +2630,192 @@ jobs: L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - 
model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # not testing resume functionality to save time on ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=alibi \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=alibi \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # not testing resume functionality to save time on ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=alibi \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - 
model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=kerple \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ 
+ model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=kerple \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=kerple \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] @@ -3663,36 +2973,50 @@ jobs: L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=32 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - 
model.peft.peft_scheme=none \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_sft_results - + runs-on: self-hosted-azure-gpus-1 + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.precision=bf16 \ + trainer.max_steps=4 \ + trainer.val_check_interval=4 \ + trainer.enable_checkpointing=False \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + exp_manager.checkpoint_callback_params.save_best_model=False \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.peft.peft_scheme=none \ + model.optim.name=distributed_fused_adam \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.num_workers=0 \ + model.data.train_ds.concat_sampling_probabilities=[1.0] + + rm -rf examples/nlp/language_modeling/gpt_sft_results + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4545,75 +3869,7 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/bart_pretrain_results - # L2: Megatron T5 GLUE/XNLI Finetuning - # TODO(Oktai15): update it in 1.8.0 version - L2_Megatron_T5_GLUE_RTE: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.task_name=rte \ - 
model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=rte \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_glue_results - - L2_Megatron_T5_GLUE_XNLI: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - -cn megatron_t5_config_finetune_glue_xnli \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.test_ds.global_batch_size=2 \ - model.data.test_ds.micro_batch_size=2 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=xnli \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - model.data.test_ds.task_name=xnli \ - model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_xnli_results - + L2_Megatron_T5_PEFT_Lora_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4941,23 +4197,7 @@ jobs: - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA - - L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel - - L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier - - L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel - - L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel - - L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel - - L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel - L2_Duplex_Text_Normalization_with_Tarred_dataset - - L2_BERT_Text_Classification_with_BERT_Test - - L2_Parallel_BERT_Question-Answering_SQUAD_v1_1 - - L2_Parallel_BERT_Question-Answering_SQUAD_v2_0 - - L2_Parallel_BART_Question-Answering_SQUAD_v1_1 - - L2_Parallel_BART_Question-Answering_SQUAD_v2_0 - - L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1 - - L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0 - 
L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test @@ -4965,13 +4205,8 @@ jobs: - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1 - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation - - L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data - - Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset - - Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir - - Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text - L2_Pretraining_BERT_pretraining_from_Text - L2_Pretraining_BERT_from_Preprocessed - - L2_Entity_Linking_Self_Alignment_Pretraining_BERT - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN - L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation @@ -5013,8 +4248,6 @@ jobs: - L2_Megatron_T5_Eval - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2 - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_GLUE_RTE - - L2_Megatron_T5_GLUE_XNLI - L2_Megatron_T5_PEFT_Lora_TP2 - L2_Megatron_Mock_Data_Generation_MockGPTDataset - L2_Megatron_Mock_Data_Generation_MockT5Dataset diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index bc66ae717ebb..4eef38e715d4 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -5,7 +5,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -41,7 +41,7 @@ exp_manager: model: # model parallelism - mcore_bert: False + mcore_bert: True micro_batch_size: 4 global_batch_size: 8 tensor_model_parallel_size: 1 @@ -85,7 +85,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 grad_div_ar_fusion: False @@ -158,4 +158,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 \ No newline at end of file + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ca0c3f74e4c8..1f63f7742ea0 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,7 +9,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -121,7 +121,7 @@ model: 
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion diff --git a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py index 98d24802189e..92c56a4c20df 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py @@ -17,6 +17,7 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueAssistantDataProcessor'] @@ -31,6 +32,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg): data_dir: path to data directory tokenizer: tokenizer object """ + # deprecation warning + deprecated_warning("DialogueAssistantDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -69,16 +73,15 @@ def open_file(self, filename): @staticmethod def get_continuous_slots(slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids): - """ Extract continuous spans of slot_ids - To accomodate slots with distinct labels for B-label1 and I-label1, + To accomodate slots with distinct labels for B-label1 and I-label1, slot_id = self.bio_slot_ids_to_unified_slot_ids[slot_id] is called to map them both to label1 - + Args: Slot: list of int representing slot of each word token - For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 + For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 Corresponds to "please set an alarm clock for my next meeting with the team at three pm next friday" Except for the empty_slot_id (54 in this case), we hope to extract the continuous spans of tokens, each containing a start position and an exclusive end position @@ -124,7 +127,7 @@ def map_bio_format_slots_to_unified_slots(slots): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the assistant dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. 
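As a worked illustration of the span extraction that the get_continuous_slots docstring above describes (0-based positions and a plain dict return are assumed here; this hunk does not show the method's actual return container):

# Worked example for the get_continuous_slots docstring above -- illustrative only,
# assuming 0-based positions and [start, exclusive_end] spans as described.
slot_ids = [54, 54, 54, 54, 54, 54, 54, 54, 18, 54, 44, 44, 54, 46, 46, 54, 12]
empty_slot_id = 54
# Continuous non-empty spans:
#   slot 18 covers position 8      -> [8, 9]
#   slot 44 covers positions 10-11 -> [10, 12]
#   slot 46 covers positions 13-14 -> [13, 15]
#   slot 12 covers position 16     -> [16, 17]
expected_spans = {18: [8, 9], 44: [10, 12], 46: [13, 15], 12: [16, 17]}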
@@ -177,7 +180,11 @@ def get_dialog_examples(self, dataset_split: str): "labels": {"service": intent.split('_')[0], "intent": intent, "slots": slot_to_words}, "label_positions": { "slots": { - slot: {"start": position[0], "exclusive_end": position[1], "slot": slot,} + slot: { + "start": position[0], + "exclusive_end": position[1], + "slot": slot, + } for slot, position in slot_to_start_and_exclusive_end.items() } }, diff --git a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py index 2a4b21c70535..c41c1f5e04ca 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py @@ -17,6 +17,7 @@ import random from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueDataProcessor'] @@ -40,6 +41,9 @@ class DialogueDataProcessor(DataProcessor): """ def __init__(self): + # deprecation warning + deprecated_warning("DialogueDataProcessor") + raise NotImplementedError() def get_train_examples(self): @@ -58,8 +62,8 @@ def get_test_examples(self): def get_relevant_idxs(dataset_split, n_samples, dev_proportion): """ Obtain indexes for each dataset_split, when train and dev sets are not in separate files - - Args: + + Args: dataset_split: train, dev or test n_samples: total number of samples dev_proportion: value from 1 to 99 that represent proportion of data in dev set diff --git a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py index 5e58919b7652..56e99c4bcfe9 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py @@ -19,6 +19,7 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueDesignDataProcessor'] @@ -34,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueDesignDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -50,7 +54,7 @@ def open_csv(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting diff --git a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py index 58814a8eee90..67d58ff5d21e 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py @@ -19,13 +19,13 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from 
nemo.utils.decorators import deprecated_warning __all__ = ['DialogueMellonQADataProcessor'] class DialogueMellonQADataProcessor(DialogueDataProcessor): - """Data Processor for Mellon QA dialogues. - """ + """Data Processor for Mellon QA dialogues.""" def __init__(self, data_dir: str, tokenizer: object, cfg=None): """ @@ -35,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueMellonQADataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -51,7 +54,7 @@ def open_csv(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the Mellon QA dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. @@ -82,7 +85,11 @@ def get_dialog_examples(self, dataset_split: str): input_example = { "utterance": utterance, "example_id": i, - "labels": {"response": answer, "fluent_response": well_formed_answer, "passage": passage,}, + "labels": { + "response": answer, + "fluent_response": well_formed_answer, + "passage": passage, + }, } example = DialogueInputExample(input_example) examples.append(example) diff --git a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py index 78f434c1d5dd..d09960a35d69 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py @@ -19,15 +19,16 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueMSMarcoDataProcessor'] class DialogueMSMarcoDataProcessor(DialogueDataProcessor): """Data Processor for MS Marco dialogues. (https://github.com/microsoft/MSMARCO-Question-Answering) - Please agree to the Terms of Use before downloading data at - https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz - https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz + Please agree to the Terms of Use before downloading data at + https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz + https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz """ def __init__(self, data_dir: str, tokenizer: object, cfg=None): @@ -39,6 +40,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): debug_mode: reduce number of samples to load in order to increase speed of processing cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueMSMarcoDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -55,7 +59,7 @@ def open_json(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the MS Marco dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. 
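The substantive change repeated across these data-processor and dataset files is the same two-line pattern: import deprecated_warning from nemo.utils.decorators and call it with the class name at the top of __init__. The helper itself lives in nemo/utils/decorators/deprecated.py (also modified by this patch) and is not shown in this section; the sketch below only illustrates what such a one-shot warning helper could look like, and is not the actual NeMo implementation.

# Illustrative sketch only -- the real helper is in nemo/utils/decorators/deprecated.py
# and may differ in signature, wording, and bookkeeping.
from nemo.utils import logging

_already_warned = set()


def deprecated_warning(old_class: str, new_class: str = None):
    """Log a deprecation warning once per class name."""
    if old_class in _already_warned:
        return
    _already_warned.add(old_class)
    msg = f"{old_class} is deprecated and will be removed in a future NeMo release."
    if new_class is not None:
        msg += f" Please use {new_class} instead."
    logging.warning(msg)


# Usage, as applied throughout this patch:
#     def __init__(self, ...):
#         deprecated_warning("DialogueAssistantDataProcessor")
#         ...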
diff --git a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py index a78e1973e55f..1d37c26f1c45 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py @@ -28,6 +28,7 @@ from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample from nemo.collections.nlp.data.dialogue.sgd.schema import Schema from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning from nemo.utils.get_rank import is_global_rank_zero __all__ = ['DialogueSGDDataProcessor'] @@ -51,7 +52,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): # git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git ***Data format*** - SGD data comes with a JSON schema file and dialogue files for each dataset split. + SGD data comes with a JSON schema file and dialogue files for each dataset split. In the following we will show an example for a service entry in the schema file. * service_name @@ -70,7 +71,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): * result_slots (not used) - In the following we will show an example for a dialogue. + In the following we will show an example for a dialogue. * dialogue_id * services * turns @@ -87,14 +88,18 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): * state * active_intent * requeste_slots - * slot_values + * slot_values * speaker - [USER, SYSTEM] * utterance """ def __init__( - self, data_dir: str, dialogues_example_dir: str, tokenizer: object, cfg=None, + self, + data_dir: str, + dialogues_example_dir: str, + tokenizer: object, + cfg=None, ): """ Constructs DialogueSGDDataProcessor @@ -104,6 +109,9 @@ def __init__( tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueSGDDataProcessor") + self.data_dir = data_dir self.cfg = cfg @@ -213,7 +221,7 @@ def get_labels(self): def get_dialog_examples(self, dataset_split: str) -> List[object]: """ - Loads preprocessed dialogue examples from disk. + Loads preprocessed dialogue examples from disk. Args: dataset_split: dataset split Returns: @@ -260,7 +268,7 @@ def _generate_dialog_examples(self, dataset_split: str, schemas: object, subsamp Returns a list of `InputExample`s of the data splits' dialogues. Args: dataset_split: data split, can be "train", "dev", or "test". - schemas: schema for all services of all datasets + schemas: schema for all services of all datasets subsample: whether to balance postive and negative samples in the dataset Returns: examples: a list of `InputExample`s. 
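To make the schema layout described in the DialogueSGDDataProcessor docstring above concrete, a single service entry could look like the following. Field names follow the docstring; the values and exact container types are invented for illustration and are not taken from the SGD data itself.

# Hypothetical schema entry, shaped after the fields listed in the class docstring above.
example_service_schema = {
    "service_name": "Restaurants_1",
    "description": "A service for finding and reserving restaurants",
    "slots": [
        {
            "name": "city",
            "description": "City where the restaurant is located",
            "is_categorical": False,
            "possible_values": [],
        },
    ],
    "intents": [
        {
            "name": "ReserveRestaurant",
            "description": "Reserve a table at a restaurant",
            "required_slots": ["city"],
            "is_transactional": True,
            "optional_slots": {},
            "result_slots": [],  # not used by this processor
        },
    ],
}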
@@ -447,9 +455,9 @@ def _create_examples_from_turn( "example_id_num": example_id_num, "utterance": user_utterance, "system_utterance": system_utterance, - "system_slots": {slot["slot"]: slot for slot in system_frame["slots"]} - if system_frame is not None - else None, + "system_slots": ( + {slot["slot"]: slot for slot in system_frame["slots"]} if system_frame is not None else None + ), "system_actions": system_frame["actions"] if system_frame is not None else None, "labels": { "service": service, @@ -464,9 +472,11 @@ def _create_examples_from_turn( for intent in schemas.get_service_schema(service).intents ], "slots": { - slot: schemas.get_service_schema(service).get_categorical_slot_values(slot) - if slot in categorical_slots - else [] + slot: ( + schemas.get_service_schema(service).get_categorical_slot_values(slot) + if slot in categorical_slots + else [] + ) for slot in all_possible_slots }, }, diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py index 0931fe383f94..33d46c308e81 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py @@ -21,12 +21,12 @@ from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueBERTDataset', 'DialogueIntentSlotInferenceDataset'] class DialogueBERTDataset(DialogueDataset): - """ Creates a dataset to use for the task of joint intent and slot classification with pretrained model. @@ -37,8 +37,7 @@ class DialogueBERTDataset(DialogueDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), @@ -57,6 +56,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: config container for dataset """ + # deprecation warning + deprecated_warning("DialogueBERTDataset") + self.cfg = cfg self.all_possible_labels = dialogues_processor.intents self.label_to_label_id = {self.all_possible_labels[i]: i for i in range(len(self.all_possible_labels))} @@ -183,7 +185,7 @@ def get_features( ignore_start_end=False, ): """ - Convert queries (utterance, intent label and slot labels) to BERT input format + Convert queries (utterance, intent label and slot labels) to BERT input format """ all_subtokens = [] @@ -297,7 +299,7 @@ class DialogueIntentSlotInferenceDataset(DialogueBERTDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: """ - Returns definitions of module output ports. + Returns definitions of module output ports. 
""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), @@ -308,6 +310,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: } def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): + # deprecation warning + deprecated_warning("DialogueIntentSlotInferenceDataset") + if do_lower_case: queries = [query.lower() for query in queries] diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py index 1ac04a856a89..f89a5013c2ae 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py @@ -21,27 +21,31 @@ from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class DialogueGPTClassificationDataset(DialogueDataset): ''' Designed for classification tasks such as intent/domain classification as well as slot tagging - Dataset Class + Dataset Class 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling + 3. Users can configurate which labels to use for modelling (e.g. intent classification, slot filling or both together etc) ''' def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor + """Constructor Args: dataset_split: dataset split dialogues_processor: Data generator for SGD dialogues tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueGPTClassificationDataset") + self.cfg = cfg if self.cfg.target_template == "with_slots" and self.cfg.eval_mode != "generation": @@ -229,19 +233,18 @@ def collate_fn(self, batch): return all_items def __getitem__(self, idx: int): - ''' State how the input and output samples look like This template can be changed - Training example: + Training example: e.g. service: restaurant e.g. service: restaurant e.g. \nintent: set alarm\nslots: (), () Generation example: - e.g. service: + e.g. 
service: ''' ex = self.features[idx].data diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py index 7de02d75c574..8ddbc2e3925e 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py @@ -18,12 +18,13 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning class DialogueGPTGenerationDataset(DialogueDataset): def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation + """Constructor + Designed for free form generation tasks such as Dialogue Response Generation Args: dataset_split: dataset split @@ -31,6 +32,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueGPTGenerationDataset") + self.cfg = cfg self.input_label_type = self.cfg.input_field self.output_label_type = self.cfg.output_field @@ -80,7 +84,7 @@ def format_prompt(self, ex): ''' Formats training prompt based on self.input_field_type - Training example: + Training example: e.g. response: # input_label_type = response e.g. utterance: # input_label_type = utterance e.g. passage: utterance: # input_label_type = passage+utterance @@ -91,7 +95,6 @@ def format_prompt(self, ex): return input_sentence def __getitem__(self, idx: int): - ''' For each example, this function determines the format of input and output sequences based on user-specified conguration. 
This is controlled by model.dataset.input_field and model.dataset.output_field @@ -99,9 +102,9 @@ def __getitem__(self, idx: int): If model.dataset.input_field == response and model.dataset.output_field == fluent_response: Input = "response: " and output = "response: fluent_response: " (with loss calculated from only) If model.dataset.input_field == utterance and model.dataset.output_field == response: - Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) + Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) If model.dataset.input_field == passage+utterance and model.dataset.output_field == response: - Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) + Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) ''' ex = self.features[idx].data diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py index 8618f2f8c7b4..dc123ca0e3d7 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py @@ -17,6 +17,7 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueNearestNeighbourDataset'] @@ -33,6 +34,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c dialogues_processor: Data generator for dialogues tokenizer: tokenizer to split text into sub-word tokens """ + # deprecation warning + deprecated_warning("DialogueNearestNeighbourDataset") + self.cfg = cfg self.tokenizer = tokenizer self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py index 78fda55edd2e..df522b74e861 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py @@ -16,12 +16,13 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning class DialogueS2SGenerationDataset(DialogueDataset): def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation + """Constructor + Designed for free form generation tasks such as Dialogue Response Generation Args: dataset_split: dataset split @@ -29,6 +30,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueS2SGenerationDataset") + self.cfg = cfg self.input_label_type = self.cfg.input_field self.output_label_type = self.cfg.output_field @@ -45,7 +49,7 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c @staticmethod def format_actions(prompt_template, actions): """ - Formats actions based on prompt_template + Formats actions based on prompt_template Args: prompt_template: determines whether acts, slot-names, slot-values are necessary 
in formatted actions @@ -118,7 +122,7 @@ def format_prompt(self, ex): ''' Formats training prompt based on self.input_field_type - Training example: + Training example: e.g. response: # input_label_type = response e.g. utterance: # input_label_type = utterance e.g. passage: utterance: # input_label_type = passage+utterance @@ -128,13 +132,12 @@ def format_prompt(self, ex): return input_sentence def __getitem__(self, idx: int): - ''' State how the input and output samples look like This template can be changed - Training example: + Training example: e.g. INPUT - "response: " OUTPUT - "" # input_label_type = response, output_label_type = fluent_response e.g. INPUT - "utterance: " OUTPUT - "" # input_label_type = utterance, output_label_type = response e.g. INPUT - "passage: utterance: " OUTPUT - "" # input_label_type = passage+utterance, output_label_type = response diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py index f2a0f58bcfac..c1308238bea1 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueZeroShotIntentDataset'] @@ -36,8 +37,7 @@ class DialogueZeroShotIntentDataset(GLUEDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), @@ -55,6 +55,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c num_classes: number of classes in the data (should be either 2 or 3, corresponding to labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) """ + # deprecation warning + deprecated_warning("DialogueZeroShotIntentDataset") + self.cfg = cfg self.tokenizer = tokenizer if self.cfg.num_classes not in [2, 3]: @@ -69,9 +72,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c 'eos_token': tokenizer.eos_token, 'pad_token': tokenizer.pad_token, 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token - if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() - else None, + 'sep_token_extra': ( + tokenizer.eos_token if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() else None + ), } self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) @@ -128,9 +131,9 @@ def convert_examples_to_features( * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - + The convention in BERT is: - + a. For sequence pairs: * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 @@ -148,9 +151,9 @@ def convert_examples_to_features( For classification tasks, the first vector (corresponding to [CLS]) is used as as the "sentence vector". Note that this only makes sense because the entire model is fine-tuned. 
- + The convention for NMT is: - + a. For sequence pairs: * tokens: is this jack ##ville ? no it is not . * type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 diff --git a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py index 5d985466ff6c..bbd14f47a651 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py @@ -17,6 +17,7 @@ from nemo.collections.nlp.modules.common import VirtualPromptSource from nemo.core import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['BasePromptLearningDataset'] @@ -41,6 +42,9 @@ def __init__( add_eos: bool = True, for_train: bool = True, ): + # deprecation warning + deprecated_warning("BasePromptLearningDataset") + self.tokenizer = tokenizer self.virtual_prompt_source = virtual_prompt_source self.task_templates = task_templates @@ -72,7 +76,7 @@ def __init__( raise ValueError("Datasets must be a list of dicts or a list of filepath strings") def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" total_inserted_tokens = 0 for idx in range(len(virtual_token_splits)): @@ -85,7 +89,7 @@ def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits return input_example def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virtual_tokens=0): - """ Try to truncate input text to fit into the max sequence length """ + """Try to truncate input text to fit into the max sequence length""" logging.info( f"Input greater than max sequence length. Attempting to truncate: '{truncation_field}' in task: '{taskname}'" ) @@ -115,7 +119,7 @@ def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virt return input_ids def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ + """Add leading space to text if there is a space before it in the template""" prompt_template = self.task_templates[taskname]["prompt_template"] field_text_start = prompt_template.find("{" + field_name + "}") if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": @@ -187,11 +191,11 @@ def pad_taskname_ids(self, taskname_ids): def find_subsequence_location(sequence, subsequence): - """ Finds the start and end index of the first occurance - of a given subsequence within a larger list. Returns - the two indices corresponding to the postition of - the first and last token of the subseqeunce. - Assumes subsequence is known to be in sequence. + """Finds the start and end index of the first occurance + of a given subsequence within a larger list. Returns + the two indices corresponding to the postition of + the first and last token of the subseqeunce. + Assumes subsequence is known to be in sequence. 
""" assert len(sequence) >= len(subsequence), "subsequence too long" diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py index 4b1b4f61d439..11795bd150f1 100755 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.modules.common.megatron.utils import build_position_ids from nemo.core import Dataset from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning __all__ = ['GPTPromptLearningDataset'] @@ -30,7 +31,7 @@ class GPTPromptLearningDataset(Dataset): """ The dataset class for prompt-tuning or p-tuning pretrained GPT models. - + Args: data (list[strings], list[dicts]): (1) paths to .jsonl or .json files, (2) dict objects corresponding to each input example tokenizer (tokenizer): Tokenizer from frozen language model @@ -39,7 +40,7 @@ class GPTPromptLearningDataset(Dataset): pseudo_tokens (list[strings]): A list of virtual prompt token placeholders e.g [, , ...] up to max num virtual tokens pad_token_id (int): ID of pad token from tokenizer max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. + min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. add_bos (bool): Whether to add a beginning of sentence token to each data example add_eos (bool): Whether to add an end of sentence token to each data example for_train (bool): Whether you're creating a dataset for training or inference @@ -63,6 +64,9 @@ def __init__( cache_data_path: str = None, # the cache file load_cache: bool = True, # whether to load from the cache if it is available ): + # deprecation warning + deprecated_warning("GPTPromptLearningDataset") + self.tokenizer = tokenizer self.virtual_prompt_source = virtual_prompt_source self.task_templates = task_templates @@ -112,9 +116,9 @@ def __init__( def load_data(self, dataset): """ Loads a dataset by filling in the task templates specified in the config file - with the information from each training/inference example. Converts all input - text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in - the task templates with the actual virtual prompt token ids. + with the information from each training/inference example. Converts all input + text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in + the task templates with the actual virtual prompt token ids. 
params: dataset: A list of json objects or a dictionary objects each @@ -241,7 +245,7 @@ def _input_sanity_checks( assert prompt_template[placeholder_start:] == answer_placeholder, "Answer field must be at prompt end" def _insert_text_in_template(self, input_example, prompt_template_fields, doc): - """ Format the input example according to the template """ + """Format the input example according to the template""" for field in prompt_template_fields: if field in doc.keys(): field_text = doc[field] @@ -255,7 +259,7 @@ def _insert_text_in_template(self, input_example, prompt_template_fields, doc): return input_example.strip(" ") def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" total_inserted_tokens = 0 for idx in range(len(virtual_token_splits)): @@ -270,7 +274,7 @@ def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits def _truncate_input( self, truncation_field, input_ids, taskname, doc, prompt_template, prompt_template_fields, virtual_token_splits ): - """ Try to truncate input text to fit into the max sequence length """ + """Try to truncate input text to fit into the max sequence length""" logging.info( f"Input greater than max sequence length. Attempting to truncate: '{truncation_field}' in task: '{taskname}'" ) @@ -297,8 +301,8 @@ def _truncate_input( return input_ids def _find_answer_start(self, taskname, input_ids, answer_field, doc): - """ Find the token ids corresponding to the answer start, for loss masking purposes. - Assumes the answer is always at the end of the prompt. + """Find the token ids corresponding to the answer start, for loss masking purposes. + Assumes the answer is always at the end of the prompt. 
""" answer_text = doc[answer_field] answer_text = self._add_leading_space(taskname, answer_field, answer_text) @@ -313,7 +317,7 @@ def _find_answer_start(self, taskname, input_ids, answer_field, doc): return answer_start_idx def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ + """Add leading space to text if there is a space before it in the template""" prompt_template = self.task_templates[taskname]["prompt_template"] field_text_start = prompt_template.find("{" + field_name + "}") if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": @@ -331,7 +335,7 @@ def _ceil_to_nearest(self, n, m): return (n + m - 1) // m * m def collate_fn(self, batch, tp_workers=0): - """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """ + """Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch""" taskname_ids, input_ids, answer_starts = zip(*batch) # Pad taskname_ids to be the same length for the prompt encoder @@ -380,7 +384,7 @@ def collate_fn(self, batch, tp_workers=0): return input_ids, labels, loss_mask, position_ids, attention_mask, taskname_ids def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): - """ Pad input_ids in batch to max batch length while building loss mask """ + """Pad input_ids in batch to max batch length while building loss mask""" batch_loss_masks = [] padded_input_ids = [] for ids, answer_start_idx in zip(input_ids, answer_starts): @@ -410,7 +414,7 @@ def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): def inference_collate_fn(self, batch): """ - Used for loading inference data. + Used for loading inference data. 
""" task_id_nums, input_ids, answer_starts = zip(*batch) input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids]) diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py index 4070098b5e67..87174b69ffc2 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py @@ -22,10 +22,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_bert_input_example import BERTQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BERTQADataset(QADataset): - """ Creates a Dataset for BERT architecture based Exractive QA """ + """Creates a Dataset for BERT architecture based Exractive QA""" def __init__( self, @@ -41,6 +42,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("BERTQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -92,7 +96,7 @@ def __init__( self.features[i] = BERTQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -110,7 +114,7 @@ def _set_cached_features_filename(self): ) def _convert_examples_to_features(self): - """ Converts loaded examples to features """ + """Converts loaded examples to features""" logging.info(f"Preprocessing data into features.") @@ -161,7 +165,7 @@ def _convert_examples_to_features(self): example.doc_tokens = doc_tokens # the text to tokens step is the slowest step - for (i, token) in enumerate(doc_tokens): + for i, token in enumerate(doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) if token not in text_to_tokens_dict: text_to_tokens_dict[token] = self.tokenizer.text_to_tokens(token) @@ -199,7 +203,7 @@ def _convert_examples_to_features(self): # make compatible for hashing doc_spans = tuple(doc_spans) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [self.tokenizer.cls_token] + query_tokens + [self.tokenizer.sep_token] segment_ids = [0 for i in range(len(tokens))] diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py index 783b2dd33f31..553f5984952c 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py @@ -28,14 +28,24 @@ ) from nemo.core.classes import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class QADataset(Dataset): - ''' Abstract base class for QA Datasets with common utility methods ''' + '''Abstract base class for QA Datasets with common utility methods''' def __init__( - self, data_file: str, processor: object, tokenizer: object, mode: str, num_samples: int, **kwargs, + self, + data_file: str, + processor: object, + tokenizer: object, + mode: str, + num_samples: int, + **kwargs, ): + # deprecation warning + deprecated_warning("QADataset") + self.mode = mode self.data_file = 
data_file self.processor = processor @@ -100,7 +110,7 @@ def get_best_span_index(doc_spans, position): best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -150,7 +160,7 @@ def get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride): all_doc_tokens: list of all tokens in document max_tokens_for_doc: maximum number of tokens in each doc span doc_stride: stride size which sliding window moves with - + Returns: doc_spans: all possible doc_spans from document """ @@ -179,7 +189,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ doc_span tok_start_position: start position of answer in document tok_end_position: end position of answer in document - + Returns: average distance of doc_span to answer """ @@ -193,7 +203,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ @staticmethod def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode): """ - Filters out doc_spans, which might not be relevant to answering question, + Filters out doc_spans, which might not be relevant to answering question, which can be helpful when document is extremely long leading to many doc_spans with no answers Args: @@ -204,7 +214,7 @@ def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode all: do not filter only_positive: only keep doc_spans containing the answer limited_negative: only keep 10 doc_spans that are nearest to answer - + Returns: doc_spans: doc_spans after filtering """ @@ -282,9 +292,13 @@ def get_doc_tokens_and_offset_from_context_id( @staticmethod def improve_answer_span( - doc_tokens: List[str], input_start: int, input_end: int, tokenizer: object, orig_answer_text: str, + doc_tokens: List[str], + input_start: int, + input_end: int, + tokenizer: object, + orig_answer_text: str, ): - """ Returns tokenized answer spans that better match the annotated answer """ + """Returns tokenized answer spans that better match the annotated answer""" tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py index d6484b33e202..1eeb312a62a9 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py @@ -24,10 +24,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_gpt_input_example import GPTQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class GPTQADataset(QADataset): - """ Creates a Dataset for GPT architecture based Generative QA """ + """Creates a Dataset for GPT architecture based Generative QA""" def __init__( self, @@ -44,6 +45,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("GPTQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -76,7 +80,7 @@ def __init__( self.features[i] = GPTQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache 
filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -120,7 +124,11 @@ def _convert_examples_to_features(self): formatted_query, query_tokens_length = self._prep_query(query_prefix, example) formatted_answer, answer_tokens_length = self._prep_answer(example) context_tokens, context_spans = self._prep_context( - example, query_tokens_length, answer_tokens_length, context_prefix_tokens, answer_prefix_tokens, + example, + query_tokens_length, + answer_tokens_length, + context_prefix_tokens, + answer_prefix_tokens, ) unique_id = self._encode_all_context_spans( @@ -170,7 +178,12 @@ def _prep_answer(self, example): return self._get_truncated_sentence_and_len(target, self.max_answer_length) def _prep_context( - self, example, query_tokens_length, answer_tokens_length, context_prefix_tokens, answer_prefix_tokens, + self, + example, + query_tokens_length, + answer_tokens_length, + context_prefix_tokens, + answer_prefix_tokens, ): """ Calculates the maximum possible length for a given context given a question diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py index 1f9a8ef615a9..c65c8a43c440 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py @@ -23,10 +23,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_s2s_input_example import S2SQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class S2SQADataset(QADataset): - """ Creates a Dataset for T5/BART architecture based Generative QA """ + """Creates a Dataset for T5/BART architecture based Generative QA""" def __init__( self, @@ -43,6 +44,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("S2SQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -75,7 +79,7 @@ def __init__( self.features[i] = S2SQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -117,7 +121,12 @@ def _convert_examples_to_features(self): context_tokens, context_spans = self._prep_context(example, query_tokens, context_prefix_tokens) unique_id = self._encode_all_context_spans( - unique_id, context_spans, context_tokens, formatted_query, example, example_index, + unique_id, + context_spans, + context_tokens, + formatted_query, + example, + example_index, ) # delete self.examples during training mode to save memory @@ -155,7 +164,13 @@ def _prep_context(self, example, query_tokens, context_prefix_tokens): return context_tokens, context_spans def _encode_all_context_spans( - self, unique_id, context_spans, context_tokens, formatted_query, example, example_index, + self, + unique_id, + context_spans, + context_tokens, + formatted_query, + example, + example_index, ): """ Fromats all spans extracted from a single context as: @@ -173,7 +188,11 @@ def _encode_all_context_spans( # encode input encoded_input_dict = self.tokenizer.tokenizer( - source, truncation=True, 
max_length=self.max_seq_length, padding="max_length", return_tensors="pt", + source, + truncation=True, + max_length=self.max_seq_length, + padding="max_length", + return_tensors="pt", ) input_ids = torch.squeeze(encoded_input_dict["input_ids"]) input_attn_mask = torch.squeeze(encoded_input_dict["attention_mask"]) @@ -223,7 +242,11 @@ def _encode_answer(self, example, context_span_text): target = example.answer_text encoded_output_dict = self.tokenizer.tokenizer( - target, truncation=True, max_length=self.max_answer_length, padding="max_length", return_tensors="pt", + target, + truncation=True, + max_length=self.max_answer_length, + padding="max_length", + return_tensors="pt", ) labels = torch.squeeze(encoded_output_dict["input_ids"]) labels[labels == self.tokenizer.tokenizer.pad_token_id] = -100 diff --git a/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py b/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py index ee1a0957dbbb..2abe9b7c0aaa 100644 --- a/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py +++ b/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py @@ -46,6 +46,7 @@ ) from nemo.core.classes import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['SquadDataset', 'InputFeatures', '_check_is_max_context'] @@ -114,7 +115,7 @@ def get_best_span_index(doc_spans, position): """ best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -165,6 +166,9 @@ def __init__( mode: str, use_cache: bool, ): + # deprecation warning + deprecated_warning("SquadDataset") + self.tokenizer = tokenizer self.version_2_with_negative = version_2_with_negative self.processor = SquadProcessor(data_file=data_file, mode=mode) @@ -337,7 +341,7 @@ def get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride): all_doc_tokens: list of all tokens in document max_tokens_for_doc: maximum number of tokens in each doc span doc_stride: stride size which sliding window moves with - + Returns: doc_spans: all possible doc_spans from document """ @@ -375,7 +379,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ doc_span tok_start_position: start position of answer in document tok_end_position: end position of answer in document - + Returns: average distance of doc_span to answer """ @@ -387,7 +391,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ @staticmethod def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode): """ - Filters out doc_spans, which might not be relevant to answering question, + Filters out doc_spans, which might not be relevant to answering question, which can be helpful when document is extremely long leading to many doc_spans with no answers Args: @@ -398,7 +402,7 @@ def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode all: do not filter only_positive: only keep doc_spans containing the answer limited_negative: only keep 10 doc_spans that are nearest to answer - + Returns: doc_spans: doc_spans after filtering """ @@ -481,7 +485,7 @@ def convert_examples_to_features( if self.mode != TRAINING_MODE: example.doc_tokens = doc_tokens # the text to tokens step is the slowest step - for (i, token) in enumerate(doc_tokens): + for i, token in enumerate(doc_tokens): 
orig_to_tok_index.append(len(all_doc_tokens)) if token not in text_to_tokens_dict: text_to_tokens_dict[token] = tokenizer.text_to_tokens(token) @@ -521,7 +525,7 @@ def convert_examples_to_features( # make compatible for hashing doc_spans = tuple(doc_spans) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [tokenizer.cls_token] + query_tokens + [tokenizer.sep_token] segment_ids = [0 for i in range(len(tokens))] @@ -681,7 +685,7 @@ def get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(self.examples): + for example_index, example in enumerate(self.examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -706,7 +710,7 @@ def get_predictions( null_start_logit = 0 # end logit at the slice with min null score null_end_logit = 0 - for (feature_index, feature) in enumerate(features): + for feature_index, feature in enumerate(features): pos = unique_id_to_pos[feature.unique_id] start_indexes = get_best_indexes(start_logits[pos], n_best_size) end_indexes = get_best_indexes(end_logits[pos], n_best_size) @@ -825,7 +829,7 @@ def get_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["question"] = example.question_text output["text"] = entry.text diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py index 803d0eaf8aed..c98abb300c64 100644 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py +++ b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py @@ -20,6 +20,8 @@ from transformers import PreTrainedTokenizerBase +from nemo.utils.decorators import deprecated_warning + """Build BERT Examples from asr hypothesis, customization candidates, target labels, span info. """ @@ -52,7 +54,7 @@ def __init__( input_ids: indices of single characters (treated as subwords) input_mask: list of bools with 0s in place of input_ids to be masked segment_ids: list of ints from 0 to 10 to denote the text segment type ( - 0 - for tokens of ASR hypothesis, + 0 - for tokens of ASR hypothesis, 1 - for tokens of the first candidate ... 
10 - for tokens of the tenth candidate @@ -60,7 +62,7 @@ def __init__( input_ids_for_subwords: indices of real subwords (as tokenized by bert tokenizer) input_mask_for_subwords: list of bools with 0s in place of input_ids_for_subwords to be masked segment_ids_for_subwords: same as segment_ids but for input_ids_for_subwords - character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) + character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) fragment_indices: list of tuples (start_position, end_position, candidate_id), end is exclusive, candidate_id can be -1 if not set labels_mask: bool tensor with 0s in place of label tokens to be masked labels: indices of semiotic classes which should be predicted from each of the @@ -68,6 +70,9 @@ def __init__( spans: list of tuples (class_id, start_position, end_position), end is exclusive, class is always 1(CUSTOM) default_label: The default label """ + # deprecation warning + deprecated_warning("BertExample") + input_len = len(input_ids) if not ( input_len == len(input_mask) @@ -123,6 +128,9 @@ def __init__( tokenizer: Tokenizer object. max_seq_length: Maximum sequence length. """ + # deprecation warning + deprecated_warning("BertExampleBuilder") + self._label_map = label_map self._semiotic_classes = semiotic_classes self._tokenizer = tokenizer @@ -183,9 +191,15 @@ def build_bert_example( tags[start:end] = [t for i in range(end - start)] # get input features for characters - (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = self._get_input_features( - hyp=hyp, ref=ref, tags=tags - ) + ( + input_ids, + input_mask, + segment_ids, + labels_mask, + labels, + _, + _, + ) = self._get_input_features(hyp=hyp, ref=ref, tags=tags) # get input features for words hyp_with_words = hyp.replace(" ", "").replace("_", " ") @@ -243,11 +257,11 @@ def build_bert_example( return example def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: - """ Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample - - Example: - span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] + """Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample + + Example: + span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] + result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] """ result_spans = [] @@ -267,26 +281,26 @@ def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: def _get_fragment_indices( self, hyp: str, targets: List[int], span_info_parts: List[str] ) -> Tuple[List[Tuple[int, int, int]]]: - """ Build fragment indices for real candidates. - This is used only at inference. - After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). - In this function we - 1) adjust start/end positions to match word borders (possibly in multiple ways). - 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). 
- - Args: - hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). - targets: list of candidate ids (only for real candidates, not dummy) - span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. - Returns: - List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. - Note that returned fragments can be unsorted and can overlap, it's ok. - Example: - hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - targets: [1 2 3 4 6 7 9] - span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion. - fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] - """ + """Build fragment indices for real candidates. + This is used only at inference. + After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). + In this function we + 1) adjust start/end positions to match word borders (possibly in multiple ways). + 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). + + Args: + hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). + targets: list of candidate ids (only for real candidates, not dummy) + span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. + Returns: + List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. + Note that returned fragments can be unsorted and can overlap, it's ok. + Example: + hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" + targets: [1 2 3 4 6 7 9] + span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion. + fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] + """ fragment_indices = [] @@ -337,18 +351,18 @@ def _get_fragment_indices( return fragment_indices def _map_characters_to_subwords(self, input_ids: List[int], input_ids_for_subwords: List[int]) -> List[int]: - """ Maps each single character to the position of its corresponding subword. - - Args: - input_ids: List of character token ids. - input_ids_for_subwords: List of subword token ids. - Returns: - List of subword positions in input_ids_for_subwords. Its length is equal to len(input_ids) - - Example: - input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] - input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] - result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... , 45, 46, 46, 46, 46, 46, 47] + """Maps each single character to the position of its corresponding subword. + + Args: + input_ids: List of character token ids. + input_ids_for_subwords: List of subword token ids. + Returns: + List of subword positions in input_ids_for_subwords. 
Its length is equal to len(input_ids) + + Example: + input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] + input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] + result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... , 45, 46, 46, 46, 46, 46, 47] """ character_pos_to_subword_pos = [0 for _ in input_ids] @@ -453,7 +467,7 @@ def _get_input_features( ref: "didier saumon;astronomie;tristan guillot;tristesse;monade;christian;astronomer;solomon;dididididi;mercy" tags: None (not used for word-based case) - resulting token sequence: + resulting token sequence: '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] """ @@ -542,9 +556,9 @@ def read_input_file( infer: If true, input examples do not contain target info. Returns: - examples: List of converted examples (BertExample). + examples: List of converted examples (BertExample). or - (examples, hyps_refs): If infer==true, returns h + (examples, hyps_refs): If infer==true, returns h """ if not path.exists(input_filename): diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py index 7737bfa67f00..07ca790866c7 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py @@ -45,14 +45,19 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueGPTClassificationModel'] class DialogueGPTClassificationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueGPTClassificationModel") self.cfg = cfg self.eval_mode = cfg.dataset.eval_mode @@ -101,14 +106,14 @@ def __init__( def setup_optimizer_param_groups(self): """ - ModelPT override for prompt learning. - Optimizer will get self._optimizer_param_groups. + ModelPT override for prompt learning. + Optimizer will get self._optimizer_param_groups. Makes two optimizer param groups, one for the frozen model params - and one for the prompt-table/prompt-encoder params. The learning + and one for the prompt-table/prompt-encoder params. The learning rate for the frozen model's params will always be zero effectively freezing the model's params but still allowing for the needed gradients - to be passed around in pipeline parallel models. The prompt-encoder - and/or prompt table will use the learning rate set by the user. + to be passed around in pipeline parallel models. The prompt-encoder + and/or prompt table will use the learning rate set by the user. 
""" if not self.prompt_learning: super().setup_optimizer_param_groups() @@ -328,7 +333,10 @@ def forward(self, input_ids, attention_mask, labels, inference=True): len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 ) position_ids = torch.arange( - start=0, end=num_prompt_tokens + input_ids.size(1), dtype=torch.long, device=input_ids.device, + start=0, + end=num_prompt_tokens + input_ids.size(1), + dtype=torch.long, + device=input_ids.device, ) prompt_ids = self.get_virtual_prompt_ids_for_megatron_gpt(input_ids) @@ -708,7 +716,9 @@ def prepare_data(self): ) elif self._cfg.dataset.task == 'design': self.dialogues_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset, + data_dir=self._cfg.dataset.data_dir, + tokenizer=self.tokenizer, + cfg=self._cfg.dataset, ) else: raise ValueError("Only sgd, assistant, zero_shot, design supported for Dialogue GPT Classification Model") diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py index 602c15a50c76..116605b65d52 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py @@ -35,6 +35,7 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueGPTGenerationModel'] @@ -43,8 +44,12 @@ class DialogueGPTGenerationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueGPTGenerationModel") self.cfg = cfg self.data_prepared = False @@ -108,7 +113,10 @@ def eval_epoch_end(self, outputs, mode='val'): ) DialogueGenerationMetrics.save_predictions( - filename, generated_field, ground_truth_field, inputs, + filename, + generated_field, + ground_truth_field, + inputs, ) label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) @@ -155,7 +163,10 @@ def forward(self, input_ids, attention_mask, labels, inference=True): ) position_ids = torch.arange( - start=0, end=num_prompt_tokens + input_ids.size(1), dtype=torch.long, device=input_ids.device, + start=0, + end=num_prompt_tokens + input_ids.size(1), + dtype=torch.long, + device=input_ids.device, ) position_ids = position_ids.unsqueeze(0).repeat(input_ids.size(0), 1) @@ -228,7 +239,7 @@ def setup(self, stage=None): def prepare_megatron_generation(self, labels, input_ids, template_length): """ - # adapted from MegatronGPTModel._bucketize_gpt_inference + # adapted from MegatronGPTModel._bucketize_gpt_inference """ batch_size = labels.size(0) prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_learning else None diff --git a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py index 455b0fa17a85..29e2627fa038 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py @@ -34,14 +34,18 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from 
nemo.utils.decorators import deprecated_warning __all__ = ['DialogueNearestNeighbourModel'] class DialogueNearestNeighbourModel(NLPModel): - """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between sentence embeddings of the utterance and various label descriptions """ + """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between sentence embeddings of the utterance and various label descriptions""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("DialogueNearestNeighbourModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer) if self.cfg.library == "huggingface": @@ -155,7 +159,10 @@ def on_validation_epoch_end(self): filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") DialogueGenerationMetrics.save_predictions( - filename, predicted_labels, ground_truth_labels, decoded_inputs, + filename, + predicted_labels, + ground_truth_labels, + decoded_inputs, ) label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} diff --git a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py index 9655fbea2722..73f09f62b1d5 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py @@ -32,6 +32,7 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator @@ -46,8 +47,12 @@ class DialogueS2SGenerationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueS2SGenerationModel") self.cfg = cfg self.data_prepared = False @@ -120,7 +125,10 @@ def eval_epoch_end(self, outputs, mode='val'): ) DialogueGenerationMetrics.save_predictions( - filename, generated_field, ground_truth_field, inputs, + filename, + generated_field, + ground_truth_field, + inputs, ) label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) @@ -172,7 +180,7 @@ def forward(self, input_ids, attention_masks, labels): def prepare_megatron_generation(self, labels, input_ids, template_length): """ - # adapted from MegatronGPTModel._bucketize_gpt_inference + # adapted from MegatronGPTModel._bucketize_gpt_inference """ batch_size = labels.size(0) prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_tags else None diff --git a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py index 0e007a7bcdd1..5298c060df08 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py @@ -36,6 +36,7 @@ from nemo.collections.nlp.models import TextClassificationModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueZeroShotIntentModel'] @@ -44,6 +45,9 @@ class 
DialogueZeroShotIntentModel(TextClassificationModel): """TextClassificationModel to be trained on two- or three-class textual entailment data, to be used for zero shot intent recognition.""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("DialogueZeroShotIntentModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer) @@ -275,7 +279,10 @@ def on_validation_epoch_end(self, split="val"): filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") DialogueGenerationMetrics.save_predictions( - filename, predicted_labels, ground_truth_labels, utterances, + filename, + predicted_labels, + ground_truth_labels, + utterances, ) label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} @@ -316,7 +323,6 @@ def predict( entailment_idx=1, contradiction_idx=0, ) -> List[Dict]: - """ Given a list of queries and a list of candidate labels, return a ranked list of labels and scores for each query. diff --git a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py index a34afa64674d..777d468084e2 100644 --- a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py @@ -35,12 +35,15 @@ from nemo.core.classes import typecheck from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class IntentSlotClassificationModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ Initializes BERT Joint Intent and Slot model. - """ + """Initializes BERT Joint Intent and Slot model.""" + # deprecation warning + deprecated_warning("IntentSlotClassificationModel") + self.max_seq_length = cfg.dataset.max_seq_length self.cfg = cfg # Check the presence of data_dir. @@ -78,7 +81,7 @@ def _set_defaults_data_desc(self, cfg): OmegaConf.set_struct(cfg, True) def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): - """ Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc. """ + """Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc.""" # Save data from data desc to config - so it can be reused later, e.g. in inference. 
data_desc = IntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) OmegaConf.set_struct(cfg, False) @@ -112,7 +115,7 @@ def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): OmegaConf.set_struct(cfg, True) def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: - """ Saves label ids map to a file """ + """Saves label ids map to a file""" with open(filename, 'w') as out: labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) @@ -120,7 +123,7 @@ def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: logging.info(f'Labels mapping saved to : {out.name}') def _reconfigure_classifier(self): - """ Method reconfigures the classifier depending on the settings of model cfg.data_desc """ + """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" self.classifier = SequenceTokenClassifier( hidden_size=self.hidden_size, @@ -310,7 +313,7 @@ def get_utterance_tokens(self, token_ids, token_masks): Args: token_ids: IntTensor of size (max_seq_len, ) token_masks: BoolTensor of size (max_seq_len, ) - + Returns token_list: List of Str (list of tokens with len <= max_seq_len) """ diff --git a/nemo/collections/nlp/models/dialogue/sgdqa_model.py b/nemo/collections/nlp/models/dialogue/sgdqa_model.py index b350fd01fa09..3b30dfccd9ce 100644 --- a/nemo/collections/nlp/models/dialogue/sgdqa_model.py +++ b/nemo/collections/nlp/models/dialogue/sgdqa_model.py @@ -35,6 +35,7 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['SGDQAModel'] @@ -44,7 +45,7 @@ class SGDQAModel(NLPModel): Dialogue State Tracking Model SGD-QA (https://arxiv.org/abs/2105.08049) The SGD-QA model is a fast multi-pass schema-guided state-tracking model, that is trained on the Google schema-guided state tracking dataset (https://arxiv.org/abs/1909.05855). - The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. + The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. The model consists of two components: a neural natural language understanding model (NLU), and a rule-based state tracker. The NLU takes in a dialogue turn and different schema (entity) information options and outputs their match score. The state tracker takes the highest rated entities and composes the dialogue state across turns. 
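# Note on the recurring pattern in the hunks above: each constructor touched by this patch
# gains `from nemo.utils.decorators import deprecated_warning` and a `deprecated_warning("<ClassName>")`
# call as the first statement of `__init__`. The sketch below is only an illustration of how such a
# helper could be written; the actual helper imported from nemo.utils.decorators may differ (for
# example, it may route through NeMo's logger or deduplicate repeated warnings). The optional
# `new_class` argument mirrors the two-argument calls that appear in later hunks of this patch.
import warnings


def deprecated_warning(old_class: str, new_class: str = None):
    """Emit a deprecation notice for `old_class`, optionally pointing at a replacement."""
    msg = f"{old_class} will be deprecated in a future release."
    if new_class is not None:
        msg += f" Use {new_class} instead."
    warnings.warn(msg, DeprecationWarning, stacklevel=2)


# Usage, matching the calls added throughout this patch:
#   deprecated_warning("SGDQAModel")
#   deprecated_warning("GPTModel", "McoreGPTModel")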
@@ -55,6 +56,9 @@ def output_module(self): return self.decoder def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("SGDQAModel") + self.data_prepared = False super().__init__(cfg=cfg, trainer=trainer) self.encoder = SGDEncoder(hidden_size=self.bert_model.config.hidden_size, dropout=self._cfg.encoder.dropout) @@ -146,7 +150,7 @@ def validation_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_ Called at every validation step to aggregate and postprocess outputs on each GPU Args: batch: input batch at validation step - batch_idx: batch index + batch_idx: batch index dataloader_idx: dataloader index """ loss, tensors = self.eval_step_helper(batch=batch) @@ -163,7 +167,7 @@ def test_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: i Called at every test step to aggregate and postprocess outputs on each GPU Args: batch: input batch at test step - batch_idx: batch index + batch_idx: batch index dataloader_idx: dataloader index """ loss, tensors = self.eval_step_helper(batch=batch) @@ -318,8 +322,8 @@ def eval_step_helper(self, batch: List[torch.Tensor]): torch.zeros(total_scores.size(), device=total_scores.get_device(), dtype=total_scores.dtype), total_scores, ) - max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens ** 2), axis=-1) - max_span_p = torch.max(total_scores.view(-1, max_num_tokens ** 2), axis=-1)[0] + max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens**2), axis=-1) + max_span_p = torch.max(total_scores.view(-1, max_num_tokens**2), axis=-1)[0] span_start_index = torch.floor_divide(max_span_index, max_num_tokens) span_end_index = torch.fmod(max_span_index, max_num_tokens) @@ -415,7 +419,7 @@ def format_turn_id(ex_id_num): def combine_predictions_in_example(predictions: dict, batch_size: int): ''' - Combines predicted values to a single example. + Combines predicted values to a single example. Args: predictions: predictions ordered by keys then batch batch_size: batch size diff --git a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py index f3ef3ccb87f9..4afae81e3893 100644 --- a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py +++ b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py @@ -26,6 +26,7 @@ from nemo.core.classes.exportable import Exportable from nemo.core.neural_types import LogitsType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['EntityLinkingModel'] @@ -44,6 +45,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: def __init__(self, cfg: DictConfig, trainer: Trainer = None): """Initializes the SAP-BERT model for entity linking.""" + # deprecation warning + deprecated_warning("EntityLinkingModel") + # tokenizer needed before super().__init__() so dataset and loader can process data self._setup_tokenizer(cfg.tokenizer) @@ -123,7 +127,7 @@ def on_validation_epoch_end(self): Args: outputs: list of individual outputs of each validation step. 
Returns: - + """ if self.validation_step_outputs: avg_loss = torch.stack( diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py index 4a073e2ada1c..4447ebb89386 100644 --- a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py +++ b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py @@ -31,6 +31,7 @@ from nemo.core.classes import typecheck from nemo.core.neural_types import NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['GLUEModel'] @@ -78,6 +79,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): """ Initializes model to use BERT model for GLUE tasks. """ + # deprecation warning + deprecated_warning("GLUEModel") if cfg.task_name not in cfg.supported_tasks: raise ValueError(f'{cfg.task_name} not in supported task. Choose from {cfg.supported_tasks}') diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py index e7ae529fe4e2..67a4802d83f6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py @@ -14,7 +14,6 @@ """BERT model.""" -import warnings from dataclasses import dataclass import torch @@ -33,6 +32,7 @@ parallel_lm_logits, scaled_init_method_normal, ) +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.enums import AttnMaskType @@ -142,7 +142,13 @@ def forward(self, hidden_states, word_embeddings_weight): def post_language_model_processing( - lm_output, pooled_output, lm_head, binary_head, lm_labels, logit_weights, fp16_lm_cross_entropy, + lm_output, + pooled_output, + lm_head, + binary_head, + lm_labels, + logit_weights, + fp16_lm_cross_entropy, ): # lm_logits: [s, b, vocab_size] lm_logits = lm_head(lm_output, logit_weights) @@ -348,7 +354,10 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw if self.post_process: # TODO: Make sure you are passing in the mpu_vocab_size properly - self.lm_head = MCoreBertLMHead(self.config.hidden_size, self.config,) + self.lm_head = MCoreBertLMHead( + self.config.hidden_size, + self.config, + ) self.output_layer = tensor_parallel.ColumnParallelLinear( self.config.hidden_size, @@ -476,10 +485,9 @@ def __init__( sequence_parallel=False, position_embedding_type='learned_absolute', ): - warnings.warn( - "NeMoBertModel will be deprecated mid 2024. 
Use MCoreBertModelWrapperWithPostLNSupport instead.", - DeprecationWarning, - ) + # deprecation warning + deprecated_warning("NeMoBertModel", "MCoreBertModelWrapperWithPostLNSupport") + super(NeMoBertModel, self).__init__(config=config) self.fp16_lm_cross_entropy = fp16_lm_cross_entropy self.add_binary_head = add_binary_head diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py index 19fafb796fd7..c572d94acd11 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py @@ -24,6 +24,7 @@ parallel_lm_logits, scaled_init_method_normal, ) +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.enums import AttnMaskType @@ -167,6 +168,9 @@ def __init__( seq_len_interpolation_factor=None, rotary_base=10000, ): + # deprecation warning + deprecated_warning("GPTModel", "McoreGPTModel") + super(GPTModel, self).__init__(config=config, share_token_embeddings=share_embeddings_and_output_weights) self.parallel_output = parallel_output @@ -250,7 +254,9 @@ def __init__( if self.share_embeddings_and_output_weights: self.initialize_word_embeddings( - init_method=init_method_normal(init_method_std), vocab_size=vocab_size, hidden_size=hidden_size, + init_method=init_method_normal(init_method_std), + vocab_size=vocab_size, + hidden_size=hidden_size, ) def set_input_tensor(self, input_tensor): @@ -299,9 +305,11 @@ def forward( post_process_result = post_language_model_processing( loss_lm_output, loss_labels, - self.language_model.output_layer.weight - if not self.share_embeddings_and_output_weights - else self.word_embeddings_weight(), + ( + self.language_model.output_layer.weight + if not self.share_embeddings_and_output_weights + else self.word_embeddings_weight() + ), get_key_value, self.parallel_output, forward_method_parallel_output, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index d151925635ab..f6ee4b20183c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -37,6 +37,7 @@ from nemo.collections.nlp.modules.common.transformer.text_generation import TextGeneration from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator @@ -82,6 +83,9 @@ class MegatronBasePromptLearningModel(MegatronBaseModel, TextGeneration): """ def __init__(self, cfg: DictConfig, trainer: Trainer): + # deprecation warning + deprecated_warning("MegatronBasePromptLearningModel") + super().__init__(cfg, trainer) self.init_model(cfg, trainer) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 5ee7a3fcf480..acfc22439a7d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -44,6 +44,7 @@ from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPSaveRestoreConnector from 
nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches @@ -72,25 +73,28 @@ class MegatronGPTPromptLearningModel(MegatronBasePromptLearningModel): """ - Model class for prompt-tuning or p-tuning a pretrained Megatron GPT model. + Model class for prompt-tuning or p-tuning a pretrained Megatron GPT model. Prompt Tuning initalizes virtual prompt embeddings directly from a copy of certain token embeddings from the the pretrained GPT model's vocabulary - and directly tunes these embedding weights. The token embeddings used in - initalization are specified by the user in the config file. The model can - be prompt-tuned for multiple tasks at once. virtual prompts are stored in a - prompt table and can be added or deleted without disrupting virtual prompts - for other tasks. + and directly tunes these embedding weights. The token embeddings used in + initalization are specified by the user in the config file. The model can + be prompt-tuned for multiple tasks at once. virtual prompts are stored in a + prompt table and can be added or deleted without disrupting virtual prompts + for other tasks. P-tuning initializes an LSTM encoder model that generates virtual prompt embeddings for every task. Each task shares the same encoder. After ptuning is compelete, the learned virtual prompts can be saved to the prompt table - using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a - new virtual prompt via p-tuning, they do not need to retrain on all previous + using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a + new virtual prompt via p-tuning, they do not need to retrain on all previous tasks. This gives p-tuning the same task flexiblity as prompt-tuning. """ def __init__(self, cfg: DictConfig, trainer: Trainer): + # deprecation warning + deprecated_warning("MegatronGPTPromptLearningModel") + super().__init__(cfg, trainer) self.inference_params = None @@ -305,8 +309,8 @@ def forward( def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): """ - Dataloader produces a global batch which is turned into an iterator of microbatches. - The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. + Dataloader produces a global batch which is turned into an iterator of microbatches. + The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. """ # Get seq length of batch batch, _, _ = next(dataloader_iter) @@ -361,15 +365,15 @@ def training_step(self, dataloader_iter): return loss_mean def backward(self, *args, **kwargs): - """ LightningModule hook to do backward. - We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. - No need to call it here. + """LightningModule hook to do backward. + We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. + No need to call it here. """ return def optimizer_zero_grad(self, *args, **kwargs): - """ LightningModule hook to zero grad. - We want this to do nothing as we are zeroing grads during the training_step. + """LightningModule hook to zero grad. + We want this to do nothing as we are zeroing grads during the training_step. 
""" return @@ -415,11 +419,19 @@ def validation_step(self, dataloader_iter): labels_text.append(label) if mode == 'val': self.validation_step_outputs.append( - {'loss': loss_mean, 'preds': preds_text, 'labels': labels_text,} + { + 'loss': loss_mean, + 'preds': preds_text, + 'labels': labels_text, + } ) else: self.test_step_outputs.append( - {'loss': loss_mean, 'preds': preds_text, 'labels': labels_text,} + { + 'loss': loss_mean, + 'preds': preds_text, + 'labels': labels_text, + } ) return { 'loss': loss_mean, @@ -427,8 +439,10 @@ def validation_step(self, dataloader_iter): 'labels': labels_text, } - self.validation_step_outputs.append({'loss': loss_mean}) if mode == 'val' else self.test_step_outputs.append( - {'loss': loss_mean} + ( + self.validation_step_outputs.append({'loss': loss_mean}) + if mode == 'val' + else self.test_step_outputs.append({'loss': loss_mean}) ) return {'loss': loss_mean} @@ -481,7 +495,8 @@ def on_validation_epoch_end(self): gather_results_dedup = list(set(itertools.chain(*gather_results))) val_metric_dict = self.validation_metric.get_score( - [i[1] for i in gather_results_dedup], [i[0] for i in gather_results_dedup], + [i[1] for i in gather_results_dedup], + [i[0] for i in gather_results_dedup], ) for metric, val in val_metric_dict.items(): @@ -638,9 +653,9 @@ def build_virtual_prompt_dataset( drop_last=drop_last, num_workers=num_workers, pin_memory=pin_memory, - persistent_workers=True - if num_workers > 0 - else False, # (@adithyare and @eharper) We need this to make spawn=True to work. + persistent_workers=( + True if num_workers > 0 else False + ), # (@adithyare and @eharper) We need this to make spawn=True to work. ) return dataset, dataloader @@ -815,7 +830,7 @@ def list_available_models(cls): def get_pseudo_tokens(num_virtual_tokens): """ Takes in an integer and returns a list of strings where each string - is a numbered virtual token placeholder. If + is a numbered virtual token placeholder. If num_virtual_tokens = 3, then this function returns: ["", "", ""] @@ -823,7 +838,7 @@ def get_pseudo_tokens(num_virtual_tokens): Args: num_virtual_tokens: (int) Number of virtual token strings you want to make - returns a list of string. + returns a list of string. 
""" pseudo_tokens = [ diff --git a/nemo/collections/nlp/models/question_answering/qa_base_model.py b/nemo/collections/nlp/models/question_answering/qa_base_model.py index bfb45f51b6ac..7ca78f2e136e 100644 --- a/nemo/collections/nlp/models/question_answering/qa_base_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_base_model.py @@ -25,10 +25,14 @@ ) from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BaseQAModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None, no_lm_init=True): + # deprecation warning + deprecated_warning("BaseQAModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer, no_lm_init=no_lm_init) @@ -82,10 +86,13 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): @torch.no_grad() def _get_per_sample_perplexity(self, logits, labels): - """ Returns average perplexity for each sample in the batch """ + """Returns average perplexity for each sample in the batch""" loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none') - unreduced_loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1),) + unreduced_loss = loss_fct( + logits.view(-1, logits.size(-1)), + labels.view(-1), + ) unreduced_loss = unreduced_loss.reshape(labels.shape) mask_0 = unreduced_loss != 0 per_sample_perplexity = torch.exp((unreduced_loss * mask_0).sum(axis=1) / mask_0.sum(axis=1)) diff --git a/nemo/collections/nlp/models/question_answering/qa_bert_model.py b/nemo/collections/nlp/models/question_answering/qa_bert_model.py index 196fab4e3a04..d4bdef6d871d 100644 --- a/nemo/collections/nlp/models/question_answering/qa_bert_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_bert_model.py @@ -31,12 +31,15 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BERTQAModel(BaseQAModel): - """ BERT model with a QA (token classification) head """ + """BERT model with a QA (token classification) head""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("BERTQAModel") super().__init__(cfg=cfg, trainer=trainer, no_lm_init=False) self.classifier = TokenClassifier( @@ -190,7 +193,7 @@ def inference( num_samples: number of samples to use of inference data. Default: -1 if all data should be used. 
output_nbest_file: optional output file for writing out nbest list output_prediction_file: optional output file for writing out predictions - + Returns: model predictions, model nbest list """ @@ -209,7 +212,10 @@ def inference( logging.set_verbosity(logging.WARNING) infer_datalayer = self.setup_inference_data( - file, batch_size=batch_size, num_samples=num_samples, num_workers=2, + file, + batch_size=batch_size, + num_samples=num_samples, + num_workers=2, ) all_logits = [] @@ -244,7 +250,9 @@ def inference( if output_prediction_file: QAMetrics.dump_predicted_answers_to_file( - output_prediction_file, infer_datalayer.dataset.examples, all_predictions, + output_prediction_file, + infer_datalayer.dataset.examples, + all_predictions, ) if output_nbest_file: @@ -324,7 +332,7 @@ def get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -349,7 +357,7 @@ def get_predictions( null_start_logit = 0 # end logit at the slice with min null score null_end_logit = 0 - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] start_indexes = self._get_best_indexes(start_logits[pos], n_best_size) end_indexes = self._get_best_indexes(end_logits[pos], n_best_size) @@ -468,7 +476,7 @@ def get_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["question"] = example.question_text output["text"] = entry.text @@ -531,7 +539,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_best_indexes(self, logits, n_best_size): - """ Get the n-best logits from a list """ + """Get the n-best logits from a list""" best_indices = np.argsort(logits)[::-1] @@ -570,7 +578,7 @@ def _get_final_text(self, pred_text: str, orig_text: str, do_lower_case: bool, v def _strip_spaces(text): ns_chars = [] ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): + for i, c in enumerate(text): if c == " ": continue ns_to_s_map[len(ns_chars)] = i @@ -599,14 +607,16 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: logging.warning( - "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text, + "Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, + tok_ns_text, ) return orig_text # We then project the characters in `pred_text` back to `orig_text` using # the character-to-character alignment. 
tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): + for i, tok_index in tok_ns_to_s_map.items(): tok_s_to_ns_map[tok_index] = i orig_start_position = None diff --git a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py index 405b9a1e05ad..059cf5625f15 100644 --- a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py @@ -27,10 +27,14 @@ from nemo.collections.nlp.models.question_answering.qa_base_model import BaseQAModel from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class GPTQAModel(BaseQAModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("GPTQAModel") + self.cfg = cfg self.setup_tokenizer(cfg.tokenizer) @@ -102,7 +106,11 @@ def on_validation_epoch_end(self): eval_dataset = self._test_dl.dataset if self.trainer.testing else self._validation_dl.dataset eval_results, _, _ = self.evaluate( - eval_dataset.features, eval_dataset.examples, unique_ids, per_sample_perplexity, generated_answers, + eval_dataset.features, + eval_dataset.examples, + unique_ids, + per_sample_perplexity, + generated_answers, ) self.log(f'{prefix}_loss', avg_loss) @@ -185,10 +193,19 @@ def inference( return all_predictions, all_nbest_perdictions def evaluate( - self, features, examples, unique_ids, per_sample_perplexity, generated_texts, + self, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ): all_predictions, all_nbest_predictions = self._get_predictions( - features, examples, unique_ids, per_sample_perplexity, generated_texts, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ) eval_results = QAMetrics.evaluate_predictions(examples, all_predictions) @@ -226,7 +243,12 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_predictions( - self, features, examples: List, unique_ids: List[int], per_sample_perplexity: List, generated_texts: List, + self, + features, + examples: List, + unique_ids: List[int], + per_sample_perplexity: List, + generated_texts: List, ): unique_id_to_pos = {} for index, unique_id in enumerate(unique_ids): @@ -242,7 +264,7 @@ def _get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -250,7 +272,7 @@ def _get_predictions( curr_features = example_index_to_features[example_index] prelim_predictions = [] - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] curr_perplexity = per_sample_perplexity[pos] curr_generated_text = generated_texts[pos] diff --git a/nemo/collections/nlp/models/question_answering/qa_model.py b/nemo/collections/nlp/models/question_answering/qa_model.py index 6fb2054a2237..2147d7d6a5bf 100644 --- a/nemo/collections/nlp/models/question_answering/qa_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_model.py @@ -32,6 +32,7 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from 
nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['QAModel'] @@ -42,6 +43,9 @@ class QAModel(NLPModel): """ def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("QAModel") + super().__init__(cfg=cfg, trainer=trainer) self.classifier = TokenClassifier( hidden_size=self.hidden_size, @@ -186,7 +190,7 @@ def inference( num_samples: number of samples to use of inference data. Default: -1 if all data should be used. output_nbest_file: optional output file for writing out nbest list output_prediction_file: optional output file for writing out predictions - + Returns: model predictions, model nbest list """ diff --git a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py index 81001fb66da7..5ad959fd1b6f 100644 --- a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py @@ -28,10 +28,13 @@ from nemo.collections.nlp.models.question_answering.qa_base_model import BaseQAModel from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class S2SQAModel(BaseQAModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("S2SQAModel") self.cfg = cfg @@ -120,7 +123,11 @@ def on_validation_epoch_end(self): eval_dataset = self._test_dl.dataset if self.trainer.testing else self._validation_dl.dataset eval_results, _, _ = self.evaluate( - eval_dataset.features, eval_dataset.examples, unique_ids, per_sample_perplexity, generated_answers, + eval_dataset.features, + eval_dataset.examples, + unique_ids, + per_sample_perplexity, + generated_answers, ) self.log(f'{prefix}_loss', avg_loss) @@ -145,7 +152,11 @@ def forward(self, input_ids, input_attn_mask, labels): labels = torch.where(labels != -100, labels, torch.zeros_like(labels)) output_attn_masks = torch.where(labels > 0, torch.ones_like(labels), torch.zeros_like(labels)) unmasked_unreduced_loss = self.language_model( - input_ids, labels[:, :-1], input_attn_mask, output_attn_masks[:, :-1], lm_labels=labels[:, 1:], + input_ids, + labels[:, :-1], + input_attn_mask, + output_attn_masks[:, :-1], + lm_labels=labels[:, 1:], ) loss = self.language_model.loss_func(output_attn_masks[:, 1:], unmasked_unreduced_loss) per_sample_perplexity = torch.exp(unmasked_unreduced_loss) @@ -210,10 +221,19 @@ def inference( return all_predictions, all_nbest_predictions def evaluate( - self, features, examples, unique_ids, per_sample_perplexity, generated_texts, + self, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ): all_predictions, all_nbest_json = self._get_predictions( - features, examples, unique_ids, per_sample_perplexity, generated_texts, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ) eval_results = QAMetrics.evaluate_predictions(examples, all_predictions) @@ -251,7 +271,12 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_predictions( - self, features, examples: List, unique_ids: List[int], per_sample_perplexity: List, generated_texts: List, + self, + features, + examples: List, + unique_ids: List[int], + per_sample_perplexity: List, + generated_texts: List, ): unique_id_to_pos = {} @@ -268,7 +293,7 @@ def _get_predictions( all_predictions = collections.OrderedDict() 
all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -276,7 +301,7 @@ def _get_predictions( curr_features = example_index_to_features[example_index] prelim_predictions = [] - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] curr_perplexity = per_sample_perplexity[pos] curr_generated_text = generated_texts[pos] @@ -339,7 +364,10 @@ def _generate_candidates(self, input_ids, input_attn_mask): "max_length": num_tokens_to_generate, } generated_tokens = self.language_model.generate(**param_dict) - generated_answers = self.tokenizer.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True,) + generated_answers = self.tokenizer.tokenizer.batch_decode( + generated_tokens, + skip_special_tokens=True, + ) generated_answers = [ans.strip() for ans in generated_answers] elif self.cfg.library == 'megatron': diff --git a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py index eed94f2e1e31..d9e08f6764fc 100644 --- a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py +++ b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py @@ -35,7 +35,7 @@ from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.neural_types import LogitsType, NeuralType from nemo.utils import logging -from nemo.utils.decorators import experimental +from nemo.utils.decorators import deprecated_warning, experimental __all__ = ["SpellcheckingAsrCustomizationModel"] @@ -48,7 +48,7 @@ class SpellcheckingAsrCustomizationModel(NLPModel): It takes as input ASR hypothesis and candidate customization entries. It labels the hypothesis with correct entry index or 0. Example input: [CLS] a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o [SEP] d i d i e r _ s a u m o n [SEP] a s t r o n o m i e [SEP] t r i s t a n _ g u i l l o t [SEP] ... - Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 + Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 Example output: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 0 ... """ @@ -67,6 +67,9 @@ def output_module(self): return self def __init__(self, cfg: DictConfig, trainer: Trainer = None) -> None: + # deprecation warning + deprecated_warning("SpellcheckingAsrCustomizationModel") + super().__init__(cfg=cfg, trainer=trainer) # Label map contains 11 labels: 0 for nothing, 1..10 for target candidate ids @@ -321,7 +324,7 @@ def on_test_epoch_end(self): @torch.no_grad() def infer(self, dataloader_cfg: DictConfig, input_name: str, output_name: str) -> None: - """ Main function for Inference + """Main function for Inference Args: dataloader_cfg: config for dataloader @@ -517,7 +520,7 @@ def _setup_infer_dataloader(self, cfg: DictConfig, input_name: str) -> 'torch.ut Setup function for a infer data loader. 
Args: cfg: config dictionary containing data loader params like batch_size, num_workers and pin_memory - input_name: path to input file. + input_name: path to input file. Returns: A pytorch DataLoader. """ diff --git a/nemo/utils/decorators/__init__.py b/nemo/utils/decorators/__init__.py index 4468a3bc09b5..2cfec9e40d64 100644 --- a/nemo/utils/decorators/__init__.py +++ b/nemo/utils/decorators/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. -from nemo.utils.decorators.deprecated import deprecated +from nemo.utils.decorators.deprecated import deprecated, deprecated_warning from nemo.utils.decorators.experimental import experimental from nemo.utils.decorators.port_docs import add_port_docs diff --git a/nemo/utils/decorators/deprecated.py b/nemo/utils/decorators/deprecated.py index 65f92e62563e..40957bb343d4 100644 --- a/nemo/utils/decorators/deprecated.py +++ b/nemo/utils/decorators/deprecated.py @@ -30,14 +30,14 @@ def deprecated(wrapped=None, version=None, explanation=None, wait_seconds=0): """ - Decorator which can be used for indicating that a function/class is deprecated and going to be removed. - Tracks down which function/class printed the warning and will print it only once per call. - - Args: - version: Version in which the function/class will be removed (optional). - explanation: Additional explanation, e.g. "Please, ``use another_function`` instead." (optional). - wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned - with subsequent logging messages. + Decorator which can be used for indicating that a function/class is deprecated and going to be removed. + Tracks down which function/class printed the warning and will print it only once per call. + + Args: + version: Version in which the function/class will be removed (optional). + explanation: Additional explanation, e.g. "Please, ``use another_function`` instead." (optional). + wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned + with subsequent logging messages. """ if wrapped is None: @@ -71,3 +71,26 @@ def wrapper(wrapped, instance, args, kwargs): return wrapped(*args, **kwargs) return wrapper(wrapped) + + +def deprecated_warning(old_method=None, new_method=None, wait_seconds=2): + """ + Function which can be used for indicating that a function/class is deprecated and going to be removed. + + Args: + old_method: Name of deprecated class/function. + new_method: Name of new class/function to use. + wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned + with subsequent logging messages. + """ + + # Create a banner + if new_method is not None: + msg = f"***** {old_method} is deprecated. Please, use {new_method} instead. *****" + else: + msg = f"***** {old_method} is deprecated and will be removed soon. *****" + banner = '\n'.join(['*' * len(msg)] * 2 + [msg] + ['*' * len(msg)] * 2) + + logging.warning(f"\n\n{banner}\n") + logging.warning(f"Waiting for {wait_seconds} seconds before this message disappears.") + time.sleep(wait_seconds) diff --git a/tests/collections/nlp/test_dialogue.py b/tests/collections/nlp/test_dialogue.py deleted file mode 100644 index 9c227f737d98..000000000000 --- a/tests/collections/nlp/test_dialogue.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
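The deprecated_warning helper introduced above is what the model constructors in this patch call at the top of __init__; the intended call pattern looks like the sketch below, where the class names are hypothetical and only the import path and signature come from the code above:

from nemo.utils.decorators import deprecated_warning


class MyLegacyModel:  # hypothetical class, not part of NeMo
    def __init__(self):
        # Logs a deprecation banner and sleeps briefly so it is not drowned out by later log lines.
        deprecated_warning("MyLegacyModel", "MyMcoreReplacementModel")
        # ... regular initialization continues here ...

If there is no replacement class, the second argument can simply be omitted, which produces the "is deprecated and will be removed soon" variant of the banner.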
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -import torch - -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.sgd_data_processor import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_gpt_classification_dataset import ( - DialogueGPTClassificationDataset, -) -from nemo.collections.nlp.data.dialogue.dataset.dialogue_s2s_generation_dataset import DialogueS2SGenerationDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_sgd_bert_dataset import DialogueSGDBERTDataset -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics, DialogueGenerationMetrics -from nemo.collections.nlp.models.dialogue.dialogue_nearest_neighbour_model import DialogueNearestNeighbourModel - - -@pytest.mark.unit -def test_dialogue_metric_generation_f1(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - precision, recall, f1 = DialogueGenerationMetrics._get_one_f1(generated_field, ground_truth_field) - assert precision == 75 - assert recall == 75 - assert f1 == 75 - - -@pytest.mark.unit -def test_dialogue_metric_split_label_and_slots(): - fields = ["reserve_restaurant\nslots: time_of_day(7pm), number_of_people(3)", "time_of_day(7pm)"] - labels, slots_list = DialogueClassificationMetrics.split_label_and_slots(fields, with_slots=True) - assert labels == ["reserve_restaurant", 'none'] - assert slots_list == [["time_of_day(7pm)", "number_of_people(3)"], ["time_of_day(7pm)"]] - - -@pytest.mark.unit -def test_dialogue_metric_slot_filling_metrics(): - generated_slots = [["time_of_day(7pm)", "number_of_people(3)"], ["time_of_day(7pm)"]] - ground_truth_slots = [["time_of_day(7pm)"], ["time_of_day(7pm)", "number_of_people(3)"]] - - ( - avg_precision, - avg_recall, - avg_f1, - avg_joint_goal_accuracy, - ) = DialogueClassificationMetrics.get_slot_filling_metrics(generated_slots, ground_truth_slots) - - assert avg_precision == 75 - assert avg_recall == 75 - assert avg_f1 == 75 - assert avg_joint_goal_accuracy == 0 - - -@pytest.mark.unit -def test_dialogue_assistant_data_processor_normalize_zero_shot_intent(): - label0 = 'food_ordering.contextual_query' - normalized_label0 = 'contextual query' - - label1 = 'food_ordering.nomatch' - normalized_label1 = 'no match' - - label2 = 'food_ordering.no' - normalized_label2 = 'no' - - assert normalized_label0 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label0) - assert normalized_label1 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label1) - assert normalized_label2 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label2) - - -@pytest.mark.unit -def test_dialogue_assistant_data_processor_get_continuous_slots(): - slot_ids = [54, 54, 54, 19, 19, 18, 54, 54, 54] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 19, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, 
empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {19: [3, 5], 18: [5, 6]} - - # here 18 and 19 maps to the same slot (originally variants of B-slot and I-slot) - slot_ids = [54, 54, 54, 19, 19, 18, 54, 54, 54] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 18, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {18: [3, 6]} - - # test if function works when non-empty slots are at boundary - slot_ids = [18, 54, 54, 19, 19] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 19, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {18: [0, 1], 19: [3, 5]} - - -@pytest.mark.unit -def test_dialogue_assistant_map_bio_format_slots_to_unified_slots(): - - slots = ['B-time', 'I-time', 'B-alarm', 'I-alarm', 'O'] - gt_bio_slot_ids_to_unified_slot_ids = {'0': '0', '1': '0', '2': '1', '3': '1', '4': '2'} - gt_unified_slots = ['time', 'alarm', 'O'] - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(slots) - assert gt_bio_slot_ids_to_unified_slot_ids == bio_slot_ids_to_unified_slot_ids - assert gt_unified_slots == unified_slots - - # case in which BIOS scheme was not used in annotation - slots = ['time', 'alarm', 'O'] - gt_bio_slot_ids_to_unified_slot_ids = {'0': '0', '1': '1', '2': '2'} - gt_unified_slots = ['time', 'alarm', 'O'] - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(slots) - - assert gt_bio_slot_ids_to_unified_slot_ids == bio_slot_ids_to_unified_slot_ids - assert gt_unified_slots == unified_slots - - -@pytest.mark.unit -def test_dialogue_data_processor_get_relevant_idxs(): - - dataset_split = 'train' - dev_proportion = 10 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 900 - assert idxs != list(range(900)) - - dataset_split = 'dev' - dev_proportion = 40 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 400 - assert idxs != list(range(400)) - - dataset_split = 'test' - dev_proportion = 40 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 1000 - assert idxs == list(range(1000)) - - -@pytest.mark.unit -def test_dialogue_sgd_data_processor_convert_camelcase_to_lower(): - label = 'none' - gt_converted_label = 'none' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - label = 'ReserveRestaurant' - gt_converted_label = 'reserve restaurant' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - label = 'Alarm' - gt_converted_label = 'alarm' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - -@pytest.mark.unit -def test_dialogue_gpt_classification_dataset_linearize_slots(): - - slots = [] - linearized_slots = 'None' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - slots = {'time': '7pm', 'place': 'field'} - linearized_slots = 'time(7pm), place(field)' - assert linearized_slots == 
DialogueGPTClassificationDataset.linearize_slots(slots) - - slots = {'time': ['7pm', '1900'], 'place': 'field'} - linearized_slots = 'time(7pm), place(field)' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - -@pytest.mark.unit -def test_dialogue_gpt_classification_dataset_linearize_slots(): - - actions = [ - {'act': 'inform', 'slot': 'time', 'values': ['7pm', '1900']}, - {'act': 'confirm', 'slot': 'place', 'values': ['hall']}, - ] - - prompt_template = 'values' - formatted_actions = '7pm hall' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - prompt_template = 'slots_values' - formatted_actions = 'time (7pm) place (hall)' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - prompt_template = 'acts_slots_values' - formatted_actions = 'inform time (7pm) confirm place (hall)' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - -@pytest.mark.unit -def test_dialogue_sgd_dataset_naive_tokenize(): - - utterance = 'I am feeling hungry so I would like to find a place to eat.' - tokens = [ - 'I', - ' ', - 'am', - ' ', - 'feeling', - ' ', - 'hungry', - ' ', - 'so', - ' ', - 'I', - ' ', - 'would', - ' ', - 'like', - ' ', - 'to', - ' ', - 'find', - ' ', - 'a', - ' ', - 'place', - ' ', - 'to', - ' ', - 'eat', - '.', - ] - assert tokens == DialogueSGDBERTDataset._naive_tokenize(utterance) - - -@pytest.mark.unit -def test_dialogue_nearest_neighbour_mean_pooling(): - - model_output = [torch.ones(8, 512, 768)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.ones(8, 768).float(), DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) - - model_output = [torch.zeros(8, 512, 768)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.zeros(8, 768).float(), DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) - - model_output = [torch.cat([torch.zeros(8, 256, 768), torch.ones(8, 256, 768)], axis=1)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.ones(8, 768).float() * 0.5, DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) diff --git a/tests/collections/nlp/test_entity_linking_model.py b/tests/collections/nlp/test_entity_linking_model.py deleted file mode 100644 index 16b768184296..000000000000 --- a/tests/collections/nlp/test_entity_linking_model.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import shutil -import tempfile - -import pytest -import wget -from omegaconf import OmegaConf - -from nemo.collections.nlp.models import EntityLinkingModel - - -def get_cfg(): - - language_model = OmegaConf.create( - {"pretrained_model_name": "bert-base-uncased", "config_file": None, "config": None, "lm_checkpoint": None} - ) - - tokenizer = OmegaConf.create( - {"tokenizer_name": "bert-base-uncased", "vocab_file": None, "tokenizer_model": None, "do_lower_case": True} - ) - - model = OmegaConf.create( - { - "nemo_path": "sap_entity_linking.nemo", - "max_seq_length": 128, - "language_model": language_model, - "tokenizer": tokenizer, - "train_ds": None, - "validation_ds": None, - } - ) - - cfg = OmegaConf.create({"model": model}) - - return cfg - - -class TestEntityLinkingModel: - @pytest.mark.with_downloads() - @pytest.mark.unit - def test_creation_saving_restoring(self): - # Create a new temporary directory - with tempfile.TemporaryDirectory() as restore_dir: - with tempfile.TemporaryDirectory() as save_dir: - model = EntityLinkingModel(cfg=get_cfg().model) - assert isinstance(model, EntityLinkingModel) - - save_dir_path = save_dir - - # Where model will be saved - model_save_path = os.path.join(save_dir, f"{model.__class__.__name__}.nemo") - model.save_to(save_path=model_save_path) - - # Where model will be restored from - model_restore_path = os.path.join(restore_dir, f"{model.__class__.__name__}.nemo") - shutil.copy(model_save_path, model_restore_path) - - # at this point save_dir should not exist - assert save_dir_path is not None and not os.path.exists(save_dir_path) - assert not os.path.exists(model_save_path) - assert os.path.exists(model_restore_path) - - # attempt to restore - model_copy = model.__class__.restore_from(restore_path=model_restore_path) - assert model.num_weights == model_copy.num_weights - - -if __name__ == "__main__": - t = TestEntityLinkingModel() - t.test_creation_saving_restoring() diff --git a/tests/collections/nlp/test_megatron.py b/tests/collections/nlp/test_megatron.py deleted file mode 100644 index 8206457ec6ee..000000000000 --- a/tests/collections/nlp/test_megatron.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -try: - import apex - - apex_available = True -except Exception: - apex_available = False - -import os -import tempfile - -import onnx -import pytest -import torch -from omegaconf import OmegaConf - -import nemo.collections.nlp as nemo_nlp -from nemo.core.classes import typecheck - - -def get_pretrained_bert_345m_uncased_model(): - model_name = "megatron-bert-345m-uncased" - config = {"language_model": {"pretrained_model_name": model_name}, "tokenizer": {}} - omega_conf = OmegaConf.create(config) - model = nemo_nlp.modules.get_lm_model(cfg=omega_conf) - if torch.cuda.is_available(): - model = model.cuda() - return model - - -class TestMegatron: - @pytest.mark.skip("This test was written for megatron-lm") - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_list_pretrained_models(self): - pretrained_lm_models = nemo_nlp.modules.get_pretrained_lm_models_list() - assert len(pretrained_lm_models) > 0 - - @pytest.mark.with_downloads() - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - @pytest.mark.skip("Only one Megatron model is allowed") - def test_get_model(self): - model = get_pretrained_bert_345m_uncased_model() - assert isinstance(model, nemo_nlp.modules.MegatronBertEncoder) - - typecheck.set_typecheck_enabled(enabled=False) - inp = model.input_example() - out = model.forward(*inp) - typecheck.set_typecheck_enabled(enabled=True) - - @pytest.mark.skipif(not os.path.exists('/home/TestData'), reason='Not a Jenkins machine') - @pytest.mark.with_downloads() - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - @pytest.mark.skip("Megatron-LM BERT support deprecated. Supported in NeMo < 1.5") - def test_onnx_export(self): - model = get_pretrained_bert_345m_uncased_model() - assert model - with tempfile.TemporaryDirectory() as tmpdir: - # Generate filename in the temporary directory. - # Test export. - model.export(os.path.join(".", "megatron.onnx")) - - -if __name__ == "__main__": - t = TestMegatron() - t.test_onnx_export() diff --git a/tests/collections/nlp/test_mem_map_dataset.py b/tests/collections/nlp/test_mem_map_dataset.py deleted file mode 100644 index 20932b6c4e0d..000000000000 --- a/tests/collections/nlp/test_mem_map_dataset.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import csv -import json -import os - -import pytest - -from nemo.collections.nlp.data.language_modeling import text_memmap_dataset - - -@pytest.fixture -def jsonl_file(tmp_path): - # Create a temporary file path - file_path = tmp_path / "data.jsonl" - - # Generate data to write to the JSONL file - data = [ - {"name": "John", "age": 30}, - {"name": "Jane", "age": 25}, - {"name": "Bob", "age": 35}, - ] - - # Write data to the JSONL file - with open(file_path, mode="w") as file: - for item in data: - json.dump(item, file) - file.write("\n") - - # Provide the file path to the test function - yield str(file_path) - - # Optional: Clean up the temporary file after the test - file_path.unlink() - - -@pytest.fixture -def csv_file(tmp_path): - # Create a temporary file path - file_path = tmp_path / "data.csv" - - # Generate data to write to the CSV file - data = [["ID", "Name"], [1, "John"], [2, "Jane"], [3, "Bob"]] - - # Write data to the CSV file - with open(file_path, mode="w", newline="") as file: - writer = csv.writer(file) - writer.writerows(data) - - # Provide the file path to the test function - yield str(file_path) - - # Optional: Clean up the temporary file after the test - file_path.unlink() - - -def test_jsonl_mem_map_dataset(jsonl_file): - """Test for JSONL memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) - assert indexed_dataset[0] == {"name": "John", "age": 30} - assert indexed_dataset[1] == {"name": "Jane", "age": 25} - assert indexed_dataset[2] == {"name": "Bob", "age": 35} - - -def test_csv_mem_map_dataset(csv_file): - """Test for CSV memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.CSVMemMapDataset(dataset_paths=[csv_file], data_col=1, header_lines=1) - assert indexed_dataset[0].strip() == "John" - assert indexed_dataset[1].strip() == "Jane" - assert indexed_dataset[2].strip() == "Bob" - - -def test_csv_fields_mem_map_dataset(csv_file): - """Test for CSV memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.CSVFieldsMemmapDataset( - dataset_paths=[csv_file], data_fields={"ID": 0, "Name": 1}, header_lines=1 - ) - assert isinstance(indexed_dataset[0], dict) - assert sorted(indexed_dataset[0].keys()) == ["ID", "Name"] - assert indexed_dataset[0]["ID"] == "1" and indexed_dataset[1]["ID"] == "2" and indexed_dataset[2]["ID"] == "3" - assert ( - indexed_dataset[0]["Name"].strip() == "John" - and indexed_dataset[1]["Name"].strip() == "Jane" - and indexed_dataset[2]["Name"].strip() == "Bob" - ) - - -@pytest.mark.parametrize( - "dataset_class", [text_memmap_dataset.JSONLMemMapDataset, text_memmap_dataset.CSVMemMapDataset], -) -@pytest.mark.parametrize("use_alternative_index_mapping_dir", [True, False]) -@pytest.mark.parametrize("relative_index_fn", [True, False]) -def test_mem_map_dataset_index_mapping_dir( - tmp_path, dataset_class, jsonl_file, use_alternative_index_mapping_dir, relative_index_fn, -): - """Test for index_mapping_dir.""" - if relative_index_fn: - jsonl_file = os.path.relpath(jsonl_file) - else: - jsonl_file = os.path.abspath(jsonl_file) - - if use_alternative_index_mapping_dir: - index_mapping_dir = tmp_path / "subdir" - dataset_class(dataset_paths=[jsonl_file], header_lines=0, index_mapping_dir=str(index_mapping_dir)) - # Index files should not be created in default location. - assert not os.path.isfile(f"{jsonl_file}.idx.npy") - assert not os.path.isfile(f"{jsonl_file}.idx.info") - if relative_index_fn: - # Remove leading ".." sequences. 
- while jsonl_file.startswith(("../")): - jsonl_file = jsonl_file.lstrip("../") - idx_fn = f"{str(index_mapping_dir)}/{jsonl_file}.idx" - assert os.path.isfile(f"{idx_fn}.npy") - assert os.path.isfile(f"{idx_fn}.info") - else: - text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) - assert os.path.isfile(f"{jsonl_file}.idx.npy") - assert os.path.isfile(f"{jsonl_file}.idx.info") diff --git a/tests/collections/nlp/test_prompt_learning.py b/tests/collections/nlp/test_prompt_learning.py deleted file mode 100644 index 4597fe9ecef0..000000000000 --- a/tests/collections/nlp/test_prompt_learning.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - -import pytest -import torch - -from nemo.collections.nlp.data.language_modeling.megatron.gpt_prompt_learning_dataset import GPTPromptLearningDataset -from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import get_pseudo_tokens -from nemo.collections.nlp.modules.common import VirtualPromptSource -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.core import Dataset - - -def get_prompt_tuning_dataset( - dataset_path, tokenizer, virtual_prompt_source, task_templates, pseudo_tokens, -): - dataset = GPTPromptLearningDataset( - data=[dataset_path], - tokenizer=tokenizer, - virtual_prompt_source=virtual_prompt_source, - task_templates=task_templates, - pseudo_tokens=pseudo_tokens, - pad_token_id=tokenizer.unk_id, - max_seq_length=512, - min_seq_length=1, - ) - - return dataset - - -def create_temp_dataset(): - example_dataset_a = [ - {'taskname': 'task name A', 'text': 'Test sentence one, Answer: ', 'answer': 'test'} for i in range(24) - ] - example_dataset_b = [ - {'taskname': 'task name B', 'question': 'This is a question', 'answer': 'test'} for i in range(13) - ] - example_dataset = example_dataset_a + example_dataset_b - temp_file_name = 'temp_dataset_file.jsonl' - - with open(temp_file_name, 'w') as temp: - for example in example_dataset: - temp.write(json.dumps(example) + '\n') - - return temp_file_name - - -def get_task_templates(): - task_templates = {} - task_templates['task name A'] = { - "prompt_template": "<|VIRTUAL_PROMPT_0|>{text}{answer}", - "prompt_template_fields": ['text', 'answer'], - "total_virtual_tokens": 5, - "virtual_token_splits": [5], - "truncate_field": None, - "answer_only_loss": True, - "answer_field": "answer", - "task_id_num": 0, - } - task_templates['task name B'] = { - "prompt_template": "<|VIRTUAL_PROMPT_0|>{question}<|VIRTUAL_PROMPT_1|>{answer}{extra}", - "prompt_template_fields": ['question', 'answer', 'extra'], - "total_virtual_tokens": 10, - "virtual_token_splits": [7, 3], - "truncate_field": None, - "answer_only_loss": False, - "answer_field": None, - "task_id_num": 1, - } - return task_templates - - -class TestMegatronGPTPromptLearningDataset: - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - 
def test_init_prompt_learning_dataset(self): - tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer') - task_templates = get_task_templates() - dataset_path = create_temp_dataset() - - # Setup virtual token place holders - total_virtual_tokens = 10 - pseudo_tokens = get_pseudo_tokens(total_virtual_tokens) - tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens}) - - dataset = get_prompt_tuning_dataset( - dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens, - ) - - print(type(dataset)) - - assert isinstance(dataset, Dataset) - - os.remove(dataset_path) - - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_prompt_learning_dataset_collate_fn_prompt_encoder(self): - tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer') - task_templates = get_task_templates() - dataset_path = create_temp_dataset() - - # Setup virtual token place holders - total_virtual_tokens = 10 - pseudo_tokens = get_pseudo_tokens(total_virtual_tokens) - tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens}) - - dataset = get_prompt_tuning_dataset( - dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens, - ) - - batch = [dataset[i] for i in range(8)] - batch = dataset.collate_fn(batch) - - assert len(batch) == 6 - - _, _, _, _, _, taskname_ids = batch - - assert list(taskname_ids[0].numpy()) == tokenizer.text_to_ids("task name A") - - os.remove(dataset_path) - - -if __name__ == "__main__": - t = TestMegatronGPTPromptLearningDataset() - t.test_init_prompt_learning_dataset() - t.test_prompt_learning_dataset_collate_fn_prompt_encoder() - print('-' * 50 + '\nALL PROMPT TUNING UNIT TESTS PASS!\n' + '-' * 50) diff --git a/tests/collections/nlp/test_qna.py b/tests/collections/nlp/test_qna.py deleted file mode 100644 index 4a470cacb711..000000000000 --- a/tests/collections/nlp/test_qna.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import collections - -import pytest -import torch - -from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset -from nemo.collections.nlp.data.question_answering.dataset.qa_gpt_dataset import GPTQADataset -from nemo.collections.nlp.metrics.qa_metrics import QAMetrics - - -@pytest.mark.unit -def test_remove_articles(): - sentences = [ - "this is an apple", - "this is the apple", - "this is a fruit", - ] - - expected_article_removed_sents = ["this is apple", "this is apple", "this is fruit"] - - article_removed_sents = [QAMetrics.remove_articles(sent) for sent in sentences] - - assert article_removed_sents == expected_article_removed_sents - - -@pytest.mark.unit -def test_white_space_fix(): - sentences = [ - "sentence with a space", - "sentence with multiple spaces", - ] - - expected_white_space_fixed_sents = [ - "sentence with a space", - "sentence with multiple spaces", - ] - - white_space_fixed_sents = [QAMetrics.white_space_fix(sent) for sent in sentences] - - assert white_space_fixed_sents == expected_white_space_fixed_sents - - -@pytest.mark.unit -def test_remove_punc(): - sentence = "this, is. a! sentence: with; punctuations?" - expected_punc_removed_sent = "this is a sentence with punctuations" - - punc_removed_sent = QAMetrics.remove_punc(sentence) - - assert punc_removed_sent == expected_punc_removed_sent - - -@pytest.mark.unit -def test_get_normalized_tokens(): - sentence = 'I am happy' - tokens = ['i', 'am', 'happy'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - sentence = 'I am a person' - tokens = ['i', 'am', 'person'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - sentence = 'I am a person.' - tokens = ['i', 'am', 'person'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - -@pytest.mark.unit -def test_get_one_f1(): - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - f1 = QAMetrics.get_one_f1(generated_field, ground_truth_field) - assert f1 == 0.75 - - generated_field = '' - ground_truth_field = 'That' - - f1 = QAMetrics.get_one_f1(generated_field, ground_truth_field) - assert f1 == 0 - - -@pytest.mark.unit -def test_get_one_exact_match(): - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 0 - - generated_field = 'That is so good!' - ground_truth_field = 'That is so good.' 
- - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 1 - - generated_field = 'That is so good' - ground_truth_field = 'that is so good' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 1 - - -@pytest.mark.unit -def test_split_into_words(): - text = 'hi yo' - char_to_word_offset = [0, 0, 0, 1, 1] - doc_tokens = ["hi", "yo"] - output = QADataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - text = 'i am good' - char_to_word_offset = [0, 0, 1, 1, 1, 2, 2, 2, 2] - doc_tokens = ["i", "am", 'good'] - output = QADataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - -@pytest.mark.unit -def test_get_doc_spans(): - all_doc_tokens = ['a'] * 15 - max_tokens_for_doc = 10 - doc_stride = 5 - doc_spans = QADataset.get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride) - - assert len(doc_spans) == 2 - assert doc_spans[0].start == 0 - assert doc_spans[0].length == 10 - assert doc_spans[1].start == 5 - assert doc_spans[1].length == 10 - - -@pytest.mark.unit -def test_get_average_dist_to_tok_start_and_end(): - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_span = _DocSpan(start=0, length=5) - - tok_start_position = 1 - tok_end_position = 3 - - assert 2 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=5) - - tok_start_position = 1 - tok_end_position = 2 - - assert 6 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=4) - - tok_start_position = 1 - tok_end_position = 2 - - assert 5 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - -@pytest.mark.unit -def test_keep_relevant_docspans(): - - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'all' - assert doc_spans == QADataset.keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = -1 - tok_end_position = -1 - - mode = 'only_positive' - - expected_doc_spans = [] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'only_positive' - - expected_doc_spans = [_DocSpan(start=0, length=5), _DocSpan(start=1, length=5)] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'limited_negative' - - expected_doc_spans = [_DocSpan(start=start, length=5) for start in range(10)] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - -@pytest.mark.unit -def test_gpt_no_pad_loss_masking(): - input_ids = [1] * 15 + [50257] * 15 - input_ids = torch.tensor(input_ids) - - input_attn_mask = [1] * 16 + [0] * 14 - input_attn_mask = torch.Tensor(input_attn_mask) - - training_mask_end = 10 - - 
expected_labels = [-100] * 10 + [1] * 5 + [50257] + [-100] * 14 - expected_labels = torch.tensor(expected_labels) - - labels = GPTQADataset.update_labels_for_no_pad_loss(input_ids, training_mask_end, input_attn_mask) - - assert torch.all(labels.eq(expected_labels)) diff --git a/tests/collections/nlp/test_question_answering.py b/tests/collections/nlp/test_question_answering.py deleted file mode 100644 index c4aacf449c50..000000000000 --- a/tests/collections/nlp/test_question_answering.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -from pydoc import doc - -import pytest - -from nemo.collections.nlp.data.question_answering_squad.qa_dataset import SquadDataset -from nemo.collections.nlp.data.question_answering_squad.qa_squad_processing import ( - _get_tokens, - exact_match_score, - f1_score, -) - - -@pytest.mark.unit -def test_get_tokens(): - sentence = 'I am happy' - tokens = ['i', 'am', 'happy'] - assert tokens == _get_tokens(sentence) - - sentence = 'I am a person' - tokens = ['i', 'am', 'person'] - assert tokens == _get_tokens(sentence) - - sentence = 'I am a person.' - tokens = ['i', 'am', 'person'] - assert tokens == _get_tokens(sentence) - - -@pytest.mark.unit -def test_f1_score(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - f1 = f1_score(generated_field, ground_truth_field) - assert f1 == 0.75 - - generated_field = '' - ground_truth_field = 'That' - - f1 = f1_score(generated_field, ground_truth_field) - assert f1 == 0 - - -@pytest.mark.unit -def test_exact_match_score(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 0 - - generated_field = 'That is so good!' - ground_truth_field = 'That is so good.' 
- - em = exact_match_score(generated_field, ground_truth_field) - assert em == 1 - - generated_field = 'That is so good' - ground_truth_field = 'that is so good' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 1 - - -@pytest.mark.unit -def test_split_into_words(): - text = 'hi yo' - char_to_word_offset = [0, 0, 0, 1, 1] - doc_tokens = ["hi", "yo"] - output = SquadDataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - text = 'i am good' - char_to_word_offset = [0, 0, 1, 1, 1, 2, 2, 2, 2] - doc_tokens = ["i", "am", 'good'] - output = SquadDataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - -@pytest.mark.unit -def test_get_doc_spans(): - all_doc_tokens = ['a'] * 15 - max_tokens_for_doc = 10 - doc_stride = 5 - doc_spans = SquadDataset.get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride) - - assert len(doc_spans) == 2 - assert doc_spans[0].start == 0 - assert doc_spans[0].length == 10 - assert doc_spans[1].start == 5 - assert doc_spans[1].length == 10 - - -@pytest.mark.unit -def test_get_average_dist_to_tok_start_and_end(): - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_span = _DocSpan(start=0, length=5) - - tok_start_position = 1 - tok_end_position = 3 - - assert 2 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=5) - - tok_start_position = 1 - tok_end_position = 2 - - assert 6 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=4) - - tok_start_position = 1 - tok_end_position = 2 - - assert 5 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - -@pytest.mark.unit -def test_keep_relevant_docspans(): - - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'all' - assert doc_spans == SquadDataset.keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = -1 - tok_end_position = -1 - - mode = 'only_positive' - - expected_doc_spans = [] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'only_positive' - - expected_doc_spans = [_DocSpan(start=0, length=5), _DocSpan(start=1, length=5)] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'limited_negative' - - expected_doc_spans = [_DocSpan(start=start, length=5) for start in range(10)] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) diff --git a/tests/collections/nlp/test_spellchecking_asr_customization.py b/tests/collections/nlp/test_spellchecking_asr_customization.py deleted file mode 100644 index 8e4d6e9a7b8f..000000000000 --- 
a/tests/collections/nlp/test_spellchecking_asr_customization.py +++ /dev/null @@ -1,1102 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from transformers import AutoTokenizer - -from nemo.collections.nlp.data.spellchecking_asr_customization.bert_example import BertExampleBuilder -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - apply_replacements_to_text, - substitute_replacements_in_text, -) - - -@pytest.mark.unit -def test_substitute_replacements_in_text(): - text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development agreement" - replacements = [(66, 75, 'pro-terra', 0.99986), (101, 109, 'navistar', 0.996)] - gold_text = "we began the further diversification of our revenue base with the pro-terra supply agreement and the navistar joint development agreement" - corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=False) - assert corrected_text == gold_text - - gold_text_no_hyphen = "we began the further diversification of our revenue base with the pro terra supply agreement and the navistar joint development agreement" - corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=True) - assert corrected_text == gold_text_no_hyphen - - -@pytest.mark.unit -def test_apply_replacements_to_text(): - - # min_prob = 0.5 - # dp_data = None, - # min_dp_score_per_symbol: float = -99.9 - - # test more than one fragment to replace, test multiple same replacements - text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development agreement" - replacements = [ - (66, 75, 'proterra', 0.99986), - (66, 75, 'proterra', 0.9956), - (101, 109, 'navistar', 0.93), - (101, 109, 'navistar', 0.91), - (101, 109, 'navistar', 0.92), - ] - gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navistar joint development agreement" - corrected_text = apply_replacements_to_text( - text, replacements, min_prob=0.5, replace_hyphen_to_space=False, dp_data=None - ) - assert corrected_text == gold_text - - # test that min_prob works - gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navastar joint development agreement" - corrected_text = apply_replacements_to_text( - text, replacements, min_prob=0.95, replace_hyphen_to_space=False, dp_data=None - ) - assert corrected_text == gold_text - - -@pytest.fixture() -def bert_example_builder(): - tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_6L_768D") - label_map = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10} - semiotic_classes = {"PLAIN": 0, "CUSTOM": 1} - max_seq_len = 256 - builder = BertExampleBuilder(label_map, semiotic_classes, tokenizer, 
max_seq_len) - return builder - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_creation(bert_example_builder): - assert bert_example_builder._tokenizer is not None - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_spans(bert_example_builder): - span_info_parts = ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - gold_sorted_spans = [(1, 1, 8), (1, 38, 42), (1, 43, 47), (1, 48, 53)] - spans = bert_example_builder._get_spans(span_info_parts) - spans.sort() - assert spans == gold_sorted_spans - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_fragment_indices(bert_example_builder): - hyp = "a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w" - targets = [1] - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 - span_info_parts = ["CUSTOM 8 17"] - gold_sorted_fragment_indices = [(7, 18, 1), (11, 18, 1)] - fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) - fragment_indices.sort() - assert fragment_indices == gold_sorted_fragment_indices - - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - span_info_parts = ["CUSTOM 10 16"] - gold_sorted_fragment_indices = [(11, 18, 1)] - fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) - fragment_indices.sort() - assert fragment_indices == gold_sorted_fragment_indices - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_input_features(bert_example_builder): - hyp = "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - ref = "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" - targets = [1, 3] - span_info_parts = ["CUSTOM 12 23", "CUSTOM 28 41"] - - gold_tags = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - ] - gold_input_ids = [ - 101, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1041, - 1054, - 1055, - 1035, - 1040, - 1045, - 1040, - 1045, - 1041, - 1035, - 1055, - 1051, - 1049, - 1051, - 1050, - 1035, - 1037, - 1050, - 1040, - 1035, - 1056, - 1054, - 1045, - 1055, - 1056, - 1045, - 1037, - 1050, - 1035, - 1043, - 1048, - 1048, - 1051, - 102, - 1040, - 1045, - 1040, - 1045, - 1041, - 1054, - 1035, - 1055, - 1037, - 1057, - 1049, - 1051, - 1050, - 102, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1045, - 1041, - 102, - 1056, - 1054, - 1045, - 1055, - 1056, - 1037, - 1050, - 1035, - 1043, - 1057, - 1045, - 1048, - 1048, - 1051, - 1056, - 102, - 1056, - 1054, - 1045, - 1055, - 1056, - 1041, - 1055, - 1055, - 1041, - 102, - 1049, - 1051, - 1050, - 1037, - 1040, - 1041, - 102, - 1039, - 1044, - 1054, - 1045, - 1055, - 1056, - 1045, - 1037, - 1050, - 102, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, 
- 1041, - 1054, - 102, - 1055, - 1051, - 1048, - 1051, - 1049, - 1051, - 1050, - 102, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 102, - 1049, - 1041, - 1054, - 1039, - 1061, - 102, - ] - gold_input_mask = [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - ] - gold_segment_ids = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 5, - 5, - 5, - 5, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 10, - 10, - 10, - 10, - 10, - 10, - ] - gold_labels_mask = [ - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ] - gold_input_ids_for_subwords = [ - 101, - 26357, - 2106, - 2666, - 2061, - 8202, - 1998, - 13012, - 16643, - 2319, - 1043, - 7174, - 102, - 2106, - 3771, - 7842, - 2819, - 2239, - 102, - 28625, - 3630, - 9856, - 102, - 9822, - 26458, - 7174, - 2102, - 102, - 13012, - 13473, - 11393, - 102, - 13813, - 3207, - 102, - 3017, - 102, - 15211, - 102, - 9168, - 102, - 2106, - 28173, - 4305, - 4305, - 102, - 8673, - 102, - ] - gold_input_mask_for_subwords = [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - ] - gold_segment_ids_for_subwords = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 3, - 3, - 3, - 3, - 3, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 6, - 6, - 7, - 7, - 8, - 8, - 9, - 9, - 9, - 9, - 9, - 10, - 10, - ] - gold_character_pos_to_subword_pos = [ - 0, - 1, - 1, - 
1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 3, - 3, - 3, - 4, - 4, - 5, - 5, - 5, - 5, - 6, - 6, - 6, - 6, - 7, - 7, - 7, - 8, - 8, - 8, - 9, - 9, - 9, - 10, - 11, - 11, - 11, - 12, - 13, - 13, - 13, - 14, - 14, - 14, - 14, - 15, - 15, - 16, - 16, - 17, - 17, - 18, - 19, - 19, - 19, - 19, - 19, - 20, - 20, - 21, - 21, - 21, - 22, - 23, - 23, - 23, - 23, - 23, - 23, - 23, - 23, - 24, - 24, - 24, - 25, - 25, - 25, - 26, - 27, - 28, - 28, - 28, - 29, - 29, - 29, - 30, - 30, - 30, - 31, - 32, - 32, - 32, - 32, - 33, - 33, - 34, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 36, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 38, - 39, - 39, - 39, - 39, - 39, - 39, - 39, - 40, - 41, - 41, - 41, - 42, - 42, - 42, - 43, - 43, - 44, - 44, - 45, - 46, - 46, - 46, - 46, - 46, - 47, - ] - - tags = [0 for _ in hyp.split()] - for p, t in zip(span_info_parts, targets): - c, start, end = p.split(" ") - start = int(start) - end = int(end) - tags[start:end] = [t for i in range(end - start)] - - # get input features for characters - (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = bert_example_builder._get_input_features( - hyp=hyp, ref=ref, tags=tags - ) - - # get input features for words - hyp_with_words = hyp.replace(" ", "").replace("_", " ") - ref_with_words = ref.replace(" ", "").replace("_", " ") - ( - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - _, - _, - _, - _, - ) = bert_example_builder._get_input_features(hyp=hyp_with_words, ref=ref_with_words, tags=None) - - character_pos_to_subword_pos = bert_example_builder._map_characters_to_subwords(input_ids, input_ids_for_subwords) - - assert tags == gold_tags - assert input_ids == gold_input_ids - assert input_mask == gold_input_mask - assert segment_ids == gold_segment_ids - assert labels_mask == gold_labels_mask - assert input_ids_for_subwords == gold_input_ids_for_subwords - assert input_mask_for_subwords == gold_input_mask_for_subwords - assert segment_ids_for_subwords == gold_segment_ids_for_subwords - assert character_pos_to_subword_pos == gold_character_pos_to_subword_pos diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb deleted file mode 100644 index ddd3bdd4f929..000000000000 --- a/tutorials/nlp/Dialogue.ipynb +++ /dev/null @@ -1,717 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "jaosjY4rGRNH" - }, - "source": [ - "# Installing NeMo from source\n", - "\n", - "\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. 
Run the cell below to set up dependencies.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "goQzOSflEq27" - }, - "outputs": [], - "source": [ - "import os \n", - "BRANCH = 'main'\n", - "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n", - "!git clone https://github.com/NVIDIA/NeMo --branch $BRANCH\n", - "os.chdir('NeMo')\n", - "!./reinstall.sh\n", - "os.chdir('..')\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GjQ_z_xQMDIb" - }, - "source": [ - "# Overview\n", - "\n", - "There are three tasks as part of this tutorial\n", - "\n", - "1. Intent and Slot Classification using Assistant Dataset and a BERT model\n", - "2. Intent Classification using Schema Guided Dialogue Dataset and a GPT2 model\n", - "3. Answer Extender using MS Marco NLGen Dataset and a BART model\n", - "\n", - "Feel free to skip to the task that interests you most after installing NeMo from source." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AS-zwy8tEq2_" - }, - "source": [ - "# 1. Intent and Slot Classification using Assistant Dataset\n", - "\n", - "## 1.1 Task Description\n", - "\n", - "**Joint Intent and Slot classification** - is a task of classifying an Intent and detecting all relevant Slots (Entities)\n", - "for this Intent in a query.\n", - "For example, in the query: `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query\n", - "as a `weather` Intent, and detect `Santa Clara` as a `location` slot and `tomorrow morning` as a `date_time` slot.\n", - "Intents and Slots names are usually task specific and defined as labels in the training data.\n", - "This is a fundamental step that is executed in any task-driven Conversational Assistant.\n", - "\n", - "Our model enables to train and then detect both of these tasks together.\n", - "\n", - "Note: There is a similar model available at [Joint Intent Slot Classification Colab](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb). However, this model only support BERT style models while the model in this tutorial supports other types of models such as GPT2. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FJk_UAyeEq3B" - }, - "source": [ - "\n", - "## 1.2 Download Assistant dataset and convert to NeMo format\n", - "\n", - "This is a virtual assistant interaction data set that can be downloaded from here: https://github.com/xliuhw/NLU-Evaluation-Data.\n", - "There are about 10K training and 1K testing queries which cover 64 various Intents and 55 Slots. 
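Schematically, "joint" here means one shared encoder feeding two heads: a sentence-level intent classifier and a token-level slot classifier. A minimal sketch of that idea (illustrative only; the class and layer names below are made up and are not NeMo's actual model code):

```python
import torch
from torch import nn

class JointIntentSlotHead(nn.Module):
    """Shared encoder output -> one intent prediction per utterance
    and one slot prediction per token."""
    def __init__(self, hidden_size: int, num_intents: int, num_slots: int):
        super().__init__()
        self.intent_classifier = nn.Linear(hidden_size, num_intents)
        self.slot_classifier = nn.Linear(hidden_size, num_slots)

    def forward(self, encoded: torch.Tensor):
        # encoded: [batch, seq_len, hidden], e.g. BERT's last hidden states
        intent_logits = self.intent_classifier(encoded[:, 0])  # [CLS] position
        slot_logits = self.slot_classifier(encoded)            # every token
        return intent_logits, slot_logits

encoded = torch.randn(2, 16, 768)            # stand-in for encoder outputs
head = JointIntentSlotHead(768, num_intents=64, num_slots=55)
intent_logits, slot_logits = head(encoded)   # [2, 64] and [2, 16, 55]
```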
\n", - "\n", - "An example is:\n", - "\n", - "* utterance: what alarms have i set for tomorrow \n", - "* intent: alarm_query\n", - "* slots: date(tomorrow)\n", - "\n", - "\n", - "Note: While only the assistant dataset is used here, import_dataset.py is also compatible with ATIS and SNIPS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jjOVdGX2Eq3D" - }, - "outputs": [], - "source": [ - "# download and unzip the example dataset from github\n", - "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n", - "!unzip master.zip\n", - "# convert the dataset to the NeMo format\n", - "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5n81deZsEq3G" - }, - "source": [ - "## 1.3 Training and/or Testing the model\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eoYc_8jhEq3G" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./assistant' \\\n", - " model.dataset.dialogues_example_dir='./assistant_bert_examples' \\\n", - " model.dataset.task='assistant' \\\n", - " model.language_model.pretrained_model_name='bert-base-uncased' \\\n", - " exp_manager.create_wandb_logger=False)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GaPmHjayEbg8" - }, - "source": [ - "**Results after 3 epochs**\n", - "\n", - "Intent report: \n", - "```\n", - " label precision recall f1 support \n", - " alarm_query (label_id: 0) 100.00 94.44 97.14 18\n", - " alarm_remove (label_id: 1) 100.00 90.91 95.24 11\n", - " alarm_set (label_id: 2) 94.12 94.12 94.12 17\n", - " audio_volume_down (label_id: 3) 75.00 42.86 54.55 7\n", - " audio_volume_mute (label_id: 4) 100.00 92.86 96.30 14\n", - " audio_volume_up (label_id: 5) 72.22 100.00 83.87 13\n", - " calendar_query (label_id: 6) 87.50 77.78 82.35 18\n", - " calendar_remove (label_id: 7) 94.44 100.00 97.14 17\n", - " calendar_set (label_id: 8) 94.44 94.44 94.44 18\n", - " cooking_recipe (label_id: 9) 85.71 70.59 77.42 17\n", - " datetime_convert (label_id: 10) 88.89 100.00 94.12 8\n", - " datetime_query (label_id: 11) 89.47 100.00 94.44 17\n", - " email_addcontact (label_id: 12) 80.00 100.00 88.89 8\n", - " email_query (label_id: 13) 100.00 83.33 90.91 18\n", - " email_querycontact (label_id: 14) 78.95 88.24 83.33 17\n", - " email_sendemail (label_id: 15) 94.44 94.44 94.44 18\n", - " general_affirm (label_id: 16) 100.00 100.00 100.00 17\n", - " general_commandstop (label_id: 17) 100.00 100.00 100.00 18\n", - " general_confirm (label_id: 18) 100.00 100.00 100.00 17\n", - " general_dontcare (label_id: 19) 100.00 100.00 100.00 18\n", - " general_explain (label_id: 20) 100.00 100.00 100.00 17\n", - " general_joke (label_id: 21) 91.67 100.00 95.65 11\n", - " general_negate (label_id: 22) 100.00 100.00 100.00 18\n", - " general_praise (label_id: 23) 100.00 100.00 100.00 17\n", - " general_quirky (label_id: 24) 60.00 50.00 54.55 18\n", - " general_repeat (label_id: 25) 100.00 100.00 100.00 17\n", - " iot_cleaning (label_id: 26) 100.00 100.00 100.00 15\n", - " iot_coffee (label_id: 27) 85.71 100.00 92.31 18\n", - " 
iot_hue_lightchange (label_id: 28) 100.00 94.12 96.97 17\n", - " iot_hue_lightdim (label_id: 29) 100.00 100.00 100.00 12\n", - " iot_hue_lightoff (label_id: 30) 100.00 100.00 100.00 17\n", - " iot_hue_lighton (label_id: 31) 100.00 50.00 66.67 4\n", - " iot_hue_lightup (label_id: 32) 84.62 91.67 88.00 12\n", - " iot_wemo_off (label_id: 33) 100.00 100.00 100.00 9\n", - " iot_wemo_on (label_id: 34) 100.00 85.71 92.31 7\n", - " lists_createoradd (label_id: 35) 90.00 100.00 94.74 18\n", - " lists_query (label_id: 36) 100.00 94.12 96.97 17\n", - " lists_remove (label_id: 37) 88.89 88.89 88.89 18\n", - " music_likeness (label_id: 38) 100.00 93.75 96.77 16\n", - " music_query (label_id: 39) 100.00 100.00 100.00 17\n", - " music_settings (label_id: 40) 77.78 100.00 87.50 7\n", - " news_query (label_id: 41) 72.73 88.89 80.00 18\n", - " play_audiobook (label_id: 42) 100.00 100.00 100.00 17\n", - " play_game (label_id: 43) 93.75 83.33 88.24 18\n", - " play_music (label_id: 44) 85.00 100.00 91.89 17\n", - " play_podcasts (label_id: 45) 100.00 88.89 94.12 18\n", - " play_radio (label_id: 46) 84.21 94.12 88.89 17\n", - " qa_currency (label_id: 47) 85.00 94.44 89.47 18\n", - " qa_definition (label_id: 48) 89.47 100.00 94.44 17\n", - " qa_factoid (label_id: 49) 64.00 88.89 74.42 18\n", - " qa_maths (label_id: 50) 84.62 84.62 84.62 13\n", - " qa_stock (label_id: 51) 87.50 77.78 82.35 18\n", - " recommendation_events (label_id: 52) 87.50 82.35 84.85 17\n", - " recommendation_locations (label_id: 53) 83.33 83.33 83.33 18\n", - " recommendation_movies (label_id: 54) 100.00 60.00 75.00 10\n", - " social_post (label_id: 55) 100.00 94.12 96.97 17\n", - " social_query (label_id: 56) 100.00 82.35 90.32 17\n", - " takeaway_order (label_id: 57) 92.31 70.59 80.00 17\n", - " takeaway_query (label_id: 58) 93.75 83.33 88.24 18\n", - " transport_query (label_id: 59) 81.25 76.47 78.79 17\n", - " transport_taxi (label_id: 60) 100.00 100.00 100.00 16\n", - " transport_ticket (label_id: 61) 85.00 94.44 89.47 18\n", - " transport_traffic (label_id: 62) 93.75 88.24 90.91 17\n", - " weather_query (label_id: 63) 89.47 100.00 94.44 17\n", - " -------------------\n", - " micro avg 91.16 91.16 91.16 996\n", - " macro avg 91.66 90.44 90.48 996\n", - " weighted avg 91.72 91.16 91.04 996\n", - "```\n", - "Slot report: \n", - "```\n", - " label precision recall f1 support \n", - " alarm_type (label_id: 0) 0.00 0.00 0.00 2\n", - " app_name (label_id: 1) 0.00 0.00 0.00 1\n", - " artist_name (label_id: 2) 17.39 80.00 28.57 5\n", - " audiobook_author (label_id: 3) 0.00 0.00 0.00 0\n", - " audiobook_name (label_id: 4) 64.52 74.07 68.97 27\n", - " business_name (label_id: 5) 81.48 84.62 83.02 52\n", - " business_type (label_id: 6) 80.00 80.00 80.00 20\n", - " change_amount (label_id: 7) 57.14 66.67 61.54 6\n", - " coffee_type (label_id: 8) 100.00 33.33 50.00 3\n", - " color_type (label_id: 9) 75.00 92.31 82.76 13\n", - " cooking_type (label_id: 10) 0.00 0.00 0.00 1\n", - " currency_name (label_id: 11) 100.00 96.43 98.18 28\n", - " date (label_id: 12) 87.88 87.22 87.55 133\n", - " definition_word (label_id: 13) 85.00 85.00 85.00 20\n", - " device_type (label_id: 14) 84.75 76.92 80.65 65\n", - " drink_type (label_id: 15) 0.00 0.00 0.00 0\n", - " email_address (label_id: 16) 64.29 100.00 78.26 9\n", - " email_folder (label_id: 17) 100.00 50.00 66.67 2\n", - " event_name (label_id: 18) 80.00 75.00 77.42 64\n", - " food_type (label_id: 19) 84.38 77.14 80.60 35\n", - " game_name (label_id: 20) 93.55 78.38 85.29 37\n", - " game_type (label_id: 21) 
0.00 0.00 0.00 0\n", - " general_frequency (label_id: 22) 0.00 0.00 0.00 9\n", - " house_place (label_id: 23) 80.95 91.89 86.08 37\n", - " ingredient (label_id: 24) 0.00 0.00 0.00 1\n", - " joke_type (label_id: 25) 100.00 100.00 100.00 5\n", - " list_name (label_id: 26) 89.29 69.44 78.12 36\n", - " meal_type (label_id: 27) 0.00 0.00 0.00 3\n", - " media_type (label_id: 28) 78.95 83.33 81.08 36\n", - " movie_name (label_id: 29) 0.00 0.00 0.00 1\n", - " movie_type (label_id: 30) 0.00 0.00 0.00 0\n", - " music_album (label_id: 31) 0.00 0.00 0.00 0\n", - " music_descriptor (label_id: 32) 0.00 0.00 0.00 2\n", - " music_genre (label_id: 33) 81.82 90.00 85.71 10\n", - " news_topic (label_id: 34) 80.00 30.77 44.44 13\n", - " order_type (label_id: 35) 100.00 42.11 59.26 19\n", - " person (label_id: 36) 70.79 100.00 82.89 63\n", - " personal_info (label_id: 37) 76.19 94.12 84.21 17\n", - " place_name (label_id: 38) 82.86 84.47 83.65 103\n", - " player_setting (label_id: 39) 75.00 42.86 54.55 7\n", - " playlist_name (label_id: 40) 0.00 0.00 0.00 3\n", - " podcast_descriptor (label_id: 41) 92.31 54.55 68.57 22\n", - " podcast_name (label_id: 42) 66.67 16.67 26.67 12\n", - " radio_name (label_id: 43) 94.87 94.87 94.87 39\n", - " relation (label_id: 44) 90.91 90.91 90.91 11\n", - " song_name (label_id: 45) 100.00 6.67 12.50 15\n", - " time (label_id: 46) 77.57 84.69 80.98 98\n", - " time_zone (label_id: 47) 44.44 100.00 61.54 4\n", - " timeofday (label_id: 48) 86.96 80.00 83.33 25\n", - " transport_agency (label_id: 49) 80.00 57.14 66.67 7\n", - " transport_descriptor (label_id: 50) 0.00 0.00 0.00 5\n", - " transport_name (label_id: 51) 0.00 0.00 0.00 0\n", - " transport_type (label_id: 52) 88.89 100.00 94.12 40\n", - " weather_descriptor (label_id: 53) 87.50 87.50 87.50 8\n", - " O (label_id: 54) 97.07 97.52 97.30 5408\n", - " -------------------\n", - " micro avg 94.24 94.24 94.24 6582\n", - " macro avg 64.87 59.93 59.17 6582\n", - " weighted avg 94.23 94.24 93.95 6582\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-44x5PqyrOeQ" - }, - "source": [ - "## 1.4 (Optional) To train/ test a GPT2 model on the assistant dataset, run the cell below " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QyqQbpR4rNHT" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "# model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\": gpt2 doesn't specify a pad token, therefore using its EOS token as the pad token\n", - "# model.dataset.target_template=with_slots: this perform slot filling with intent classification\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./assistant' \\\n", - " model.dataset.dialogues_example_dir='./assistant_gpt2_examples' \\\n", - " model.dataset.task='assistant' \\\n", - " model.language_model.pretrained_model_name='gpt2' \\\n", - " trainer.max_epochs=1 \\\n", - " model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\" \\\n", - " model.dataset.target_template=with_slots \\\n", - " model.dataset.eval_mode=generation \\\n", - " exp_manager.create_wandb_logger=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FbQ-6TVM1yQg" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "More epochs would be helpful\n", - "\n", - "Intent report:\n", - "\n", - " ```\n", - " label precision recall f1 
support \n", - " transport query (label_id: 0) 72.73 84.21 78.05 19\n", - " weather query (label_id: 1) 94.74 94.74 94.74 19\n", - " play game (label_id: 2) 92.86 68.42 78.79 19\n", - " qa currency (label_id: 3) 100.00 100.00 100.00 19\n", - " qa maths (label_id: 4) 100.00 100.00 100.00 14\n", - " iot wemo off (label_id: 5) 75.00 100.00 85.71 9\n", - " datetime convert (label_id: 6) 46.67 87.50 60.87 8\n", - " email addcontact (label_id: 7) 70.00 87.50 77.78 8\n", - " music likeness (label_id: 8) 57.89 61.11 59.46 18\n", - " music query (label_id: 9) 78.57 57.89 66.67 19\n", - " general negate (label_id: 10) 95.00 100.00 97.44 19\n", - " email sendemail (label_id: 11) 92.86 68.42 78.79 19\n", - " general affirm (label_id: 12) 95.00 100.00 97.44 19\n", - " play audiobook (label_id: 13) 57.69 78.95 66.67 19\n", - " general praise (label_id: 14) 100.00 94.74 97.30 19\n", - " alarm set (label_id: 15) 85.71 94.74 90.00 19\n", - " general explain (label_id: 16) 100.00 89.47 94.44 19\n", - " iot wemo on (label_id: 17) 83.33 71.43 76.92 7\n", - " cooking recipe (label_id: 18) 90.00 94.74 92.31 19\n", - " music settings (label_id: 19) 60.00 42.86 50.00 7\n", - " social post (label_id: 20) 84.21 84.21 84.21 19\n", - " recommendation events (label_id: 21) 72.73 84.21 78.05 19\n", - " audio volume up (label_id: 22) 76.47 100.00 86.67 13\n", - " lists remove (label_id: 23) 73.08 100.00 84.44 19\n", - " transport ticket (label_id: 24) 94.74 94.74 94.74 19\n", - " general joke (label_id: 25) 100.00 100.00 100.00 12\n", - " play podcasts (label_id: 26) 94.12 84.21 88.89 19\n", - " iot hue lightchange (label_id: 27) 85.71 63.16 72.73 19\n", - " audio volume mute (label_id: 28) 84.62 73.33 78.57 15\n", - " general dontcare (label_id: 29) 95.00 100.00 97.44 19\n", - " qa definition (label_id: 30) 77.27 89.47 82.93 19\n", - " email querycontact (label_id: 31) 58.33 73.68 65.12 19\n", - " general commandstop (label_id: 32) 100.00 100.00 100.00 19\n", - " calendar remove (label_id: 33) 94.44 89.47 91.89 19\n", - " news query (label_id: 34) 100.00 57.89 73.33 19\n", - " calendar query (label_id: 35) 63.16 63.16 63.16 19\n", - " social query (label_id: 36) 88.24 83.33 85.71 18\n", - " transport traffic (label_id: 37) 90.48 100.00 95.00 19\n", - " transport taxi (label_id: 38) 100.00 94.44 97.14 18\n", - " alarm query (label_id: 39) 100.00 94.74 97.30 19\n", - " iot hue lightoff (label_id: 40) 88.89 84.21 86.49 19\n", - " takeaway order (label_id: 41) 81.25 68.42 74.29 19\n", - " iot coffee (label_id: 42) 100.00 94.74 97.30 19\n", - " recommendation movies (label_id: 43) 75.00 90.00 81.82 10\n", - " iot hue lightup (label_id: 44) 78.57 78.57 78.57 14\n", - " email query (label_id: 45) 85.71 94.74 90.00 19\n", - " lists createoradd (label_id: 46) 82.35 73.68 77.78 19\n", - " play radio (label_id: 47) 84.21 84.21 84.21 19\n", - " audio volume down (label_id: 48) 100.00 87.50 93.33 8\n", - " general quirky (label_id: 49) 30.00 15.79 20.69 19\n", - " play music (label_id: 50) 71.43 52.63 60.61 19\n", - " qa stock (label_id: 51) 90.48 100.00 95.00 19\n", - " iot cleaning (label_id: 52) 93.33 87.50 90.32 16\n", - " iot hue lightdim (label_id: 53) 100.00 100.00 100.00 12\n", - " recommendation locations (label_id: 54) 100.00 89.47 94.44 19\n", - " general repeat (label_id: 55) 100.00 100.00 100.00 19\n", - " takeaway query (label_id: 56) 77.27 89.47 82.93 19\n", - " alarm remove (label_id: 57) 100.00 100.00 100.00 11\n", - " datetime query (label_id: 58) 75.00 63.16 68.57 19\n", - " iot hue lighton (label_id: 59) 60.00 
100.00 75.00 3\n", - " qa factoid (label_id: 60) 50.00 57.89 53.66 19\n", - " calendar set (label_id: 61) 75.00 78.95 76.92 19\n", - " general confirm (label_id: 62) 100.00 100.00 100.00 19\n", - " lists query (label_id: 63) 66.67 73.68 70.00 19\n", - " label_id: 64 0.00 0.00 0.00 0\n", - " -------------------\n", - " micro avg 83.55 83.55 83.55 1076\n", - " macro avg 83.53 83.93 83.01 1076\n", - " weighted avg 84.26 83.55 83.30 1076\n", - " \n", - "```\n", - "\n", - "```\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " Test metric DataLoader 0\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " intent_f1 83.55018615722656\n", - " intent_precision 83.55018615722656\n", - " intent_recall 83.55018615722656\n", - " slot_f1 73.99985919756773\n", - "slot_joint_goal_accuracy 65.89219330855019\n", - " slot_precision 73.85223048327137\n", - " slot_recall 74.14807930607186\n", - " test_intent_accuracy 83.55018587360595\n", - " test_loss_epoch 0.019178826361894608\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gd42arYoEq3J" - }, - "source": [ - "# 2. Schema Guided Dialogue (SGD)\n", - "\n", - "## 2.1 Task Description\n", - "---\n", - "\n", - "SGD is a multi-domain intent classification dataset from Google with close to 100k examples.\n", - "\n", - "An example is:\n", - "\n", - "* utterance: I will be eating there at 11:30 am so make the reservation for then.\n", - "* intent: ReserveRestaurant\n", - "* slots: {\"time\": \"11:30 am\"}\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "neH8rXwjEq3J" - }, - "source": [ - "## 2.2 Download the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "IgD8eavfJ5pi" - }, - "outputs": [], - "source": [ - "!git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7G7uPrUpEq3J" - }, - "source": [ - "## 2.3 Training and/or Testing the model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gqo-rwQlEq3K" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "# model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\": gpt2 doesn't specify a pad token, therefore using its EOS token as the pad token\n", - "\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./dstc8-schema-guided-dialogue' \\\n", - " model.dataset.dialogues_example_dir='./sgd_gpt2_predictions' \\\n", - " model.dataset.task='sgd' \\\n", - " model.language_model.pretrained_model_name='gpt2' \\\n", - " trainer.max_epochs=1 \\\n", - " model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\" \\\n", - " exp_manager.create_wandb_logger=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kGDlV5HvI2PQ" - }, - "outputs": [], - "source": [ - "!ls sgd_gpt2_predictions" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "p8g0f5KDTu9K" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "More epochs would needed 
to reach convergence.\n", - "\n", - "\n", - "```\n", - " label precision recall f1 support \n", - " check balance (label_id: 0) 0.00 0.00 0.00 0\n", - " find trains (label_id: 1) 80.20 91.95 85.68 348\n", - " make payment (label_id: 2) 83.12 28.07 41.97 228\n", - " book appointment (label_id: 3) 86.93 87.15 87.04 397\n", - " get cars available (label_id: 4) 96.88 90.51 93.58 274\n", - " get event dates (label_id: 5) 0.00 0.00 0.00 0\n", - " buy bus ticket (label_id: 6) 78.61 91.33 84.49 173\n", - " add event (label_id: 7) 0.00 0.00 0.00 0\n", - " get alarms (label_id: 8) 58.33 77.78 66.67 45\n", - " reserve car (label_id: 9) 83.75 72.43 77.68 185\n", - " get events (label_id: 10) 0.00 0.00 0.00 0\n", - " reserve roundtrip flights (label_id: 11) 0.00 0.00 0.00 0\n", - " lookup music (label_id: 12) 89.83 86.89 88.33 61\n", - " book house (label_id: 13) 91.13 92.50 91.81 200\n", - " search oneway flight (label_id: 14) 74.77 47.70 58.25 174\n", - " buy event tickets (label_id: 15) 72.19 95.31 82.15 128\n", - " find apartment (label_id: 16) 0.00 0.00 0.00 0\n", - " schedule visit (label_id: 17) 77.27 66.06 71.23 386\n", - " play media (label_id: 18) 92.94 86.81 89.77 91\n", - " get ride (label_id: 19) 99.41 98.82 99.12 170\n", - " reserve oneway flight (label_id: 20) 0.00 0.00 0.00 0\n", - " find bus (label_id: 21) 96.64 87.53 91.86 361\n", - " find restaurants (label_id: 22) 77.14 91.22 83.59 148\n", - " get times for movie (label_id: 23) 0.00 0.00 0.00 0\n", - " transfer money (label_id: 24) 0.00 0.00 0.00 0\n", - " request payment (label_id: 25) 46.71 63.39 53.79 112\n", - " play movie (label_id: 26) 100.00 65.11 78.87 321\n", - " search house (label_id: 27) 97.91 91.83 94.77 306\n", - " search roundtrip flights (label_id: 28) 67.49 82.41 74.21 199\n", - " find provider (label_id: 29) 95.11 90.53 92.77 602\n", - " find attractions (label_id: 30) 100.00 89.01 94.19 91\n", - " reserve hotel (label_id: 31) 56.75 97.04 71.62 169\n", - " lookup song (label_id: 32) 0.00 0.00 0.00 0\n", - " add alarm (label_id: 33) 95.68 60.18 73.89 221\n", - " find home by area (label_id: 34) 48.95 59.79 53.83 194\n", - " get available time (label_id: 35) 0.00 0.00 0.00 0\n", - " buy movie tickets (label_id: 36) 100.00 29.39 45.42 473\n", - " reserve restaurant (label_id: 37) 95.71 84.80 89.92 342\n", - " find movies (label_id: 38) 62.40 97.61 76.14 335\n", - " get weather (label_id: 39) 100.00 87.69 93.44 195\n", - " search hotel (label_id: 40) 99.35 52.60 68.78 289\n", - " find events (label_id: 41) 99.57 82.56 90.27 281\n", - " play song (label_id: 42) 0.00 0.00 0.00 0\n", - " rent movie (label_id: 43) 0.00 0.00 0.00 0\n", - " get train tickets (label_id: 44) 45.83 5.56 9.91 198\n", - " none (label_id: 45) 55.77 98.90 71.32 728\n", - " label_id: 46 0.00 0.00 0.00 0\n", - " -------------------\n", - " micro avg 77.23 77.23 77.23 8425\n", - " macro avg 82.01 76.68 76.56 8425\n", - " weighted avg 83.23 77.23 76.86 8425\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jUJb-9VLLBXo" - }, - "source": [ - "# 3. 
MS Marco\n", - "\n", - "## Task Description\n", - "\n", - "MS Marco NLGen is a dataset from Microsoft that takes extracted answers and questions and output fluent answers.\n", - "\n", - "An example is \n", - "\n", - "\n", - "* question: What county is Nine Mile in?\n", - "* extracted_answer: Onondaga\n", - "* fluent_answer: Nine Mile is in Onondaga county.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VtXEKG_UQU9u" - }, - "source": [ - "## Download and unzip files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b9avsZ1CEq3K" - }, - "outputs": [], - "source": [ - "!mkdir ms_marco\n", - "os.chdir('ms_marco')\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz\n", - "\n", - "!gunzip train_v2.1.json.gz\n", - "!gunzip dev_v2.1.json.gz\n", - "\n", - "!python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename train_v2.1.json \n", - "!python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename dev_v2.1.json \n", - "\n", - "os.chdir('..')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "h7UZ9R8gQTFo" - }, - "source": [ - "## Training and/or Testing the model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fwGQCwbvRf2m" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.dialogues_example_dir='./marco_bart_predictions' \\\n", - " model.dataset.data_dir='./ms_marco' \\\n", - " model.save_model=True \\\n", - " model.dataset.debug_mode=True \\\n", - " model.dataset.task='ms_marco' \\\n", - " model.language_model.pretrained_model_name='facebook/bart-base' \\\n", - " trainer.max_epochs=1 \\\n", - " model.dataset.debug_mode=False \\\n", - " exp_manager.create_wandb_logger=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UL7ekAOZ2abi" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "Train more epochs for optimal performance\n", - "\n", - "```\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " Test metric DataLoader 0\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " bleu 65.46179962158203\n", - " f1 78.24439835896995\n", - " precision 81.92473076099847\n", - " recall 76.72508929408436\n", - " test_accuracy 25.563487607283225\n", - " test_loss 0.4419259166606655\n", - " test_loss_epoch 0.4420809745788574\n", - " test_ppl 1.5557004846779854\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - "```" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Dialogue.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 
"3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/nlp/Entity_Linking_Medical.ipynb b/tutorials/nlp/Entity_Linking_Medical.ipynb deleted file mode 100644 index dfdf594e6804..000000000000 --- a/tutorials/nlp/Entity_Linking_Medical.ipynb +++ /dev/null @@ -1,632 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "\n", - "## Install NeMo if using google collab or if its not installed locally\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Install dependencies\n", - "!pip install wget\n", - "!pip install faiss-gpu" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "import torch\n", - "import wget\n", - "import os\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from omegaconf import OmegaConf\n", - "from pytorch_lightning import Trainer\n", - "from IPython.display import display\n", - "from tqdm import tqdm\n", - "\n", - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity Linking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Task Description\n", - "[Entity linking](https://en.wikipedia.org/wiki/Entity_linking) is the process of connecting concepts mentioned in natural language to their canonical forms stored in a knowledge base. For example, say a knowledge base contained the entity 'ID3452 influenza' and we wanted to process some natural language containing the sentence \"The patient has flu like symptoms\". An entity linking model would match the word 'flu' to the knowledge base entity 'ID3452 influenza', allowing for disambiguation and normalization of concepts referenced in text. Entity linking applications range from helping automate data ingestion to assisting in real time dialogue concept normalization. We will be focusing on entity linking in the medical domain for this demo, but the entity linking model, dataset, and training code within NVIDIA NeMo can be applied to other domains like finance and retail.\n", - "\n", - "Within NeMo and this tutorial we use the entity linking approach described in Liu et. al's NAACL 2021 \"[Self-alignment Pre-training for Biomedical Entity Representations](https://arxiv.org/abs/2010.11784v2)\". The main idea behind this approach is to reshape an initial concept embedding space such that synonyms of the same concept are pulled closer together and unrelated concepts are pushed further apart. The concept embeddings from this reshaped space can then be used to build a knowledge base embedding index. 
This index stores concept IDs mapped to their respective concept embeddings in a format conducive to efficient nearest neighbor search. We can link query concepts to their canonical forms in the knowledge base by performing a nearest neighbor search- matching concept query embeddings to the most similar concepts embeddings in the knowledge base index. \n", - "\n", - "In this tutorial we will be using the [faiss](https://github.com/facebookresearch/faiss) library to build our concept index." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Self Alignment Pretraining\n", - "Self-Alignment pretraining is a second stage pretraining of an existing encoder (called second stage because the encoder model can be further finetuned after this more general pretraining step). The dataset used during training consists of pairs of concept synonyms that map to the same ID. At each training iteration, we only select *hard* examples present in the mini batch to calculate the loss and update the model weights. In this context, a hard example is an example where a concept is closer to an unrelated concept in the mini batch than it is to the synonym concept it is paired with by some margin. I encourage you to take a look at [section 2 of the paper](https://arxiv.org/pdf/2010.11784.pdf) for a more formal and in depth description of how hard examples are selected.\n", - "\n", - "We then use a [metric learning loss](https://openaccess.thecvf.com/content_CVPR_2019/papers/Wang_Multi-Similarity_Loss_With_General_Pair_Weighting_for_Deep_Metric_Learning_CVPR_2019_paper.pdf) calculated from the hard examples selected. This loss helps reshape the embedding space. The concept representation space is rearranged to be more suitable for entity matching via embedding cosine similarity. \n", - "\n", - "Now that we have idea of what's going on, let's get started!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset Preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Download data into project directory\n", - "PROJECT_DIR = \".\" #Change if you don't want the current directory to be the project dir\n", - "DATA_DIR = os.path.join(PROJECT_DIR, \"tiny_example_data\")\n", - "\n", - "if not os.path.isdir(os.path.join(DATA_DIR)):\n", - " wget.download('https://dldata-public.s3.us-east-2.amazonaws.com/tiny_example_data.zip',\n", - " os.path.join(PROJECT_DIR, \"tiny_example_data.zip\"))\n", - "\n", - " !unzip {PROJECT_DIR}/tiny_example_data.zip -d {PROJECT_DIR}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial we will be using a tiny toy dataset to demonstrate how to use NeMo's entity linking model functionality. The dataset includes synonyms for 12 medical concepts. Entity phrases with the same ID are synonyms for the same concept. For example, \"*chronic kidney failure*\", \"*gradual loss of kidney function*\", and \"*CKD*\" are all synonyms of concept ID 5. Here's the dataset before preprocessing:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_data = pd.read_csv(os.path.join(DATA_DIR, \"tiny_example_dev_data.csv\"), names=[\"ID\", \"CONCEPT\"], index_col=False)\n", - "print(raw_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've already paired off the concepts for this dataset with the format `ID concept_synonym1 concept_synonym2`. 
Here are the first ten rows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_data = pd.read_table(os.path.join(DATA_DIR, \"tiny_example_train_pairs.tsv\"), names=[\"ID\", \"CONCEPT_SYN1\", \"CONCEPT_SYN2\"], delimiter='\\t')\n", - "print(training_data.head(10))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the [Unified Medical Language System (UMLS)](https://www.nlm.nih.gov/research/umls/index.html) dataset for full medical domain entity linking training. The data contains over 9 million entities and is a table of medical concepts with their corresponding concept IDs (CUI). After [requesting a free license and making a UMLS Terminology Services (UTS) account](https://www.nlm.nih.gov/research/umls/index.html), the [entire UMLS dataset](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html) can be downloaded from the NIH's website. If you've cloned the NeMo repo you can run the data processing script located in `examples/nlp/entity_linking/data/umls_dataset_processing.py` on the full dataset. This script will take in the initial table of UMLS concepts and produce a .tsv file with each row formatted as `CUI\\tconcept_synonym1\\tconcept_synonym2`. Once the UMLS dataset .RRF file is downloaded, the script can be run from the `examples/nlp/entity_linking` directory like so: \n", - "```\n", - "python data/umls_dataset_processing.py\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Second stage pretrain a BERT Base encoder on the self-alignment pretraining task (SAP) for improved entity linking. Using a GPU, the model should take 5 minutes or less to train on this example dataset and training progress will be output below the cell." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Download config\n", - "wget.download(f\"https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml\",\n", - " os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Load in config file\n", - "cfg = OmegaConf.load(os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Set config file variables\n", - "cfg.project_dir = PROJECT_DIR\n", - "cfg.model.nemo_path = os.path.join(PROJECT_DIR, \"tiny_example_sap_bert_model.nemo\")\n", - "cfg.model.train_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_train_pairs.tsv\")\n", - "cfg.model.validation_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_validation_pairs.tsv\")\n", - "\n", - "# remove distributed training flags\n", - "cfg.trainer.strategy = 'auto'\n", - "cfg.trainer.accelerator = 'auto'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the trainer and model\n", - "trainer = Trainer(**cfg.trainer)\n", - "exp_manager(trainer, cfg.get(\"exp_manager\", None))\n", - "model = nemo_nlp.models.EntityLinkingModel(cfg=cfg.model, trainer=trainer)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Train and save the model\n", - "trainer.fit(model)\n", - "model.save_to(cfg.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can run the script at `examples/nlp/entity_linking/self_alignment_pretraining.py` to train a model on a larger dataset. Run\n", - "\n", - "```\n", - "python self_alignment_pretraining.py project_dir=.\n", - "```\n", - "from the `examples/nlp/entity_linking` directory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Evaluation\n", - "\n", - "Let's evaluate our freshly trained model and compare its performance with a BERT Base encoder that hasn't undergone self-alignment pretraining. We first need to restore our trained model and load our BERT Base Baseline model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", - "\n", - "# Restore second stage pretrained model\n", - "sap_model_cfg = cfg\n", - "sap_model_cfg.index.index_save_name = os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_index\")\n", - "sap_model_cfg.index.index_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_index_data.tsv\")\n", - "sap_model = nemo_nlp.models.EntityLinkingModel.restore_from(sap_model_cfg.model.nemo_path).to(device)\n", - "\n", - "# Load original model\n", - "base_model_cfg = OmegaConf.load(os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Set train/val datasets to None to avoid loading datasets associated with training\n", - "base_model_cfg.model.train_ds = None\n", - "base_model_cfg.model.validation_ds = None\n", - "base_model_cfg.index.index_save_name = os.path.join(PROJECT_DIR, \"base_model_index\")\n", - "base_model_cfg.index.index_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_index_data.tsv\")\n", - "base_model = nemo_nlp.models.EntityLinkingModel(base_model_cfg.model).to(device)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are going evaluate our model on a nearest neighbor task using top 1 and top 5 accuracies as our metric. We will be using a tiny example test knowledge base and test queries. For this evaluation we are going to be comparing every test query with every concept vector in our test set knowledge base. We will rank each item in the knowledge base by its cosine similarity with the test query. We'll then compare the IDs of the predicted most similar test knowledge base concepts with our ground truth query IDs to calculate top 1 and top 5 accuracies. For this metric higher is better." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Helper function to get data embeddings\n", - "def get_embeddings(model, dataloader):\n", - " embeddings, cids = [], []\n", - "\n", - " with torch.no_grad():\n", - " for batch in tqdm(dataloader):\n", - " input_ids, token_type_ids, attention_mask, batch_cids = batch\n", - " batch_embeddings = model.forward(input_ids=input_ids.to(device), \n", - " token_type_ids=token_type_ids.to(device), \n", - " attention_mask=attention_mask.to(device))\n", - "\n", - " # Accumulate index embeddings and their corresponding IDs\n", - " embeddings.extend(batch_embeddings.cpu().detach().numpy())\n", - " cids.extend(batch_cids)\n", - " \n", - " return embeddings, cids" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, test_kb, test_queries, ks):\n", - " # Initialize knowledge base and query data loaders\n", - " test_kb_dataloader = model.setup_dataloader(test_kb, is_index_data=True)\n", - " test_query_dataloader = model.setup_dataloader(test_queries, is_index_data=True)\n", - " \n", - " # Get knowledge base and query embeddings\n", - " test_kb_embs, test_kb_cids = get_embeddings(model, test_kb_dataloader)\n", - " test_query_embs, test_query_cids = get_embeddings(model, test_query_dataloader)\n", - "\n", - " # Calculate the cosine distance between each query and knowledge base concept\n", - " score_matrix = np.matmul(np.array(test_query_embs), np.array(test_kb_embs).T)\n", - " accs = {k : 0 for k in ks}\n", - " \n", - " # Compare the knowledge base IDs of the knowledge base entities with \n", - " # the smallest cosine distance from the query \n", - " for query_idx in tqdm(range(len(test_query_cids))):\n", - " query_emb = test_query_embs[query_idx]\n", - " query_cid = test_query_cids[query_idx]\n", - " query_scores = score_matrix[query_idx]\n", - "\n", - " for k in ks:\n", - " topk_idxs = np.argpartition(query_scores, -k)[-k:]\n", - " topk_cids = [test_kb_cids[idx] for idx in topk_idxs]\n", - " \n", - " # If the correct query ID is among the top k closest kb IDs\n", - " # the model correctly linked the entity\n", - " match = int(query_cid in topk_cids)\n", - " accs[k] += match\n", - "\n", - " for k in ks:\n", - " accs[k] /= len(test_query_cids)\n", - " \n", - " return accs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create configs for our test data\n", - "test_kb = OmegaConf.create({\n", - " \"data_file\": os.path.join(DATA_DIR, \"tiny_example_test_kb.tsv\"),\n", - " \"max_seq_length\": 128,\n", - " \"batch_size\": 10,\n", - " \"shuffle\": False,\n", - "})\n", - "\n", - "test_queries = OmegaConf.create({\n", - " \"data_file\": os.path.join(DATA_DIR, \"tiny_example_test_queries.tsv\"),\n", - " \"max_seq_length\": 128,\n", - " \"batch_size\": 10,\n", - " \"shuffle\": False,\n", - "})\n", - "\n", - "ks = [1, 5]\n", - "\n", - "# Evaluate both models on our test data\n", - "base_accs = evaluate(base_model, test_kb, test_queries, ks)\n", - "base_accs[\"Model\"] = \"BERT Base Baseline\"\n", - "\n", - "sap_accs = evaluate(sap_model, test_kb, test_queries, ks)\n", - "sap_accs[\"Model\"] = \"BERT + SAP\"\n", - "\n", - "print(\"Top 1 and Top 5 Accuracy Comparison:\")\n", - "results_df = pd.DataFrame([base_accs, sap_accs], columns=[\"Model\", 1, 5])\n", - "results_df = results_df.style.set_properties(**{'text-align': 'left', 
}).set_table_styles([dict(selector='th', props=[('text-align', 'left')])])\n", - "display(results_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The purpose of this section was to show an example of evaluating your entity linking model. This evaluation set contains very little data, and no serious conclusions should be drawn about model performance. Top 1 accuracy should be between 0.7 and 1.0 for both models and top 5 accuracy should be between 0.8 and 1.0. When evaluating a model trained on a larger dataset, you can use a nearest neighbors index to speed up the evaluation time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building an Index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To qualitatively observe the improvement we gain from the second stage pretraining, let's build two indices. One will be built with BERT base embeddings before self-alignment pretraining and one will be built with the model we just trained. Our knowledge base in this tutorial will be in the same domain and have some overlapping concepts as the training set. This data file is formatted as `ID\\tconcept`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `EntityLinkingDataset` class can load the data used for training the entity linking encoder as well as for building the index if the `is_index_data` flag is set to true. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def build_index(cfg, model):\n", - " # Setup index dataset loader\n", - " index_dataloader = model.setup_dataloader(cfg.index.index_ds, is_index_data=True)\n", - " \n", - " # Get index dataset embeddings\n", - " embeddings, _ = get_embeddings(model, index_dataloader)\n", - " \n", - " # Train IVFFlat index using faiss\n", - " embeddings = np.array(embeddings)\n", - " quantizer = faiss.IndexFlatL2(cfg.index.dims)\n", - " index = faiss.IndexIVFFlat(quantizer, cfg.index.dims, cfg.index.nlist)\n", - " index = faiss.index_cpu_to_all_gpus(index)\n", - " index.train(embeddings)\n", - " \n", - " # Add concept embeddings to index\n", - " for i in tqdm(range(0, embeddings.shape[0], cfg.index.index_batch_size)):\n", - " index.add(embeddings[i:i+cfg.index.index_batch_size])\n", - "\n", - " # Save index\n", - " faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index.index_save_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "build_index(sap_model_cfg, sap_model.to(device))\n", - "build_index(base_model_cfg, base_model.to(device))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity Linking via Nearest Neighbor Search" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now it's time to query our indices! We are going to query both our index built with embeddings from BERT Base, and our index with embeddings built from the SAP BERT model we trained. Our sample query phrases will be \"*high blood sugar*\" and \"*head pain*\". \n", - "\n", - "To query our indices, we first need to get the embedding of each query from the corresponding encoder model. We can then pass these query embeddings into the faiss index which will perform a nearest neighbor search, using cosine distance to compare the query embedding with embeddings present in the index. 
Once we get a list of knowledge base index concept IDs most closely matching our query, all that is left to do is map the IDs to a representative string describing the concept. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def query_index(cfg, model, index, queries, id2string):\n", - " # Get query embeddings from our entity linking encoder model\n", - " query_embs = get_query_embedding(queries, model).cpu().detach().numpy()\n", - " \n", - " # Use query embedding to find closest concept embedding in knowledge base\n", - " distances, neighbors = index.search(query_embs, cfg.index.top_n)\n", - " \n", - " # Get the canonical strings corresponding to the IDs of the query's nearest neighbors in the kb \n", - " neighbor_concepts = [[id2string[concept_id] for concept_id in query_neighbor] \\\n", - " for query_neighbor in neighbors]\n", - " \n", - " # Display most similar concepts in the knowledge base. \n", - " for query_idx in range(len(queries)):\n", - " print(f\"\\nThe most similar concepts to {queries[query_idx]} are:\")\n", - " for cid, concept, dist in zip(neighbors[query_idx], neighbor_concepts[query_idx], distances[query_idx]):\n", - " print(cid, concept, 1 - dist)\n", - "\n", - " \n", - "def get_query_embedding(queries, model):\n", - " # Tokenize our queries\n", - " model_input = model.tokenizer(queries,\n", - " add_special_tokens = True,\n", - " padding = True,\n", - " truncation = True,\n", - " max_length = 512,\n", - " return_token_type_ids = True,\n", - " return_attention_mask = True)\n", - " \n", - " # Pass tokenized input into model\n", - " query_emb = model.forward(input_ids=torch.LongTensor(model_input[\"input_ids\"]).to(device),\n", - " token_type_ids=torch.LongTensor(model_input[\"token_type_ids\"]).to(device),\n", - " attention_mask=torch.LongTensor(model_input[\"attention_mask\"]).to(device))\n", - " \n", - " return query_emb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load indices\n", - "sap_index = faiss.read_index(sap_model_cfg.index.index_save_name)\n", - "base_index = faiss.read_index(base_model_cfg.index.index_save_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Map concept IDs to one canonical string\n", - "index_data = open(sap_model_cfg.index.index_ds.data_file, \"r\", encoding='utf-8-sig')\n", - "id2string = {}\n", - "\n", - "for line in index_data:\n", - " cid, concept = line.split(\"\\t\")\n", - " id2string[int(cid) - 1] = concept.strip()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id2string" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Some sample queries\n", - "queries = [\"high blood sugar\", \"head pain\"]\n", - "\n", - "# Query BERT Base\n", - "print(\"BERT Base output before Self Alignment Pretraining:\")\n", - "query_index(base_model_cfg, base_model, base_index, queries, id2string)\n", - "print(\"\\n\" + \"-\" * 50 + \"\\n\")\n", - "\n", - "# Query SAP BERT\n", - "print(\"SAP BERT output after Self Alignment Pretraining:\")\n", - "query_index(sap_model_cfg, sap_model, sap_index, queries, id2string)\n", - "print(\"\\n\" + \"-\" * 50 + \"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even after only training on this tiny amount of data, the qualitative performance 
boost from self-alignment pretraining is visible. The baseline model links \"*high blood sugar*\" to the entity \"*6 diabetes*\" while our SAP BERT model accurately links \"*high blood sugar*\" to \"*Hyperinsulinemia*\". Similarly, \"*head pain*\" and \"*Myocardial infarction*\" are not the same concept, but \"*head pain*\" and \"*Headache*\" are."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "For larger knowledge bases, the default embedding size might be too large and cause out of memory issues. You can apply PCA or some other dimensionality reduction method to your data to reduce its memory footprint. Code for creating a text file of all the UMLS entities in the correct format needed to build an index and creating a dictionary mapping concept IDs to canonical concept strings can be found in `examples/nlp/entity_linking/data/umls_dataset_processing.py`. \n",
-    "\n",
-    "The code for extracting knowledge base concept embeddings, training and applying a PCA transformation to the embeddings, building a faiss index and querying the index from the command line is located at `examples/nlp/entity_linking/build_index.py` and `examples/nlp/entity_linking/query_index.py`. \n",
-    "\n",
-    "If you've cloned the NeMo repo, both of these steps can be run as follows on the command line from the `examples/nlp/entity_linking/` directory.\n",
-    "\n",
-    "```\n",
-    "python data/umls_dataset_processing.py --index\n",
-    "python build_index.py --restore\n",
-    "python query_index.py --restore\n",
-    "```\n",
-    "By default the project directory will be \".\" but can be changed by adding the flag `--project_dir=` after each of the above commands. Intermediate steps of the index building process are saved. If an error occurs, previously completed steps do not need to be rerun. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Command Recap"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Here is a recap of the commands and steps to repeat this process on the full UMLS dataset. \n",
-    "\n",
-    "1) Download the UMLS dataset file `MRCONSO.RRF` from the NIH website and place it in the `examples/nlp/entity_linking/data` directory.\n",
-    "\n",
-    "2) Run the following commands from the `examples/nlp/entity_linking` directory\n",
-    "```\n",
-    "python data/umls_dataset_processing.py\n",
-    "python self_alignment_pretraining.py project_dir=. \n",
-    "python data/umls_dataset_processing.py --index\n",
-    "python build_index.py --restore\n",
-    "python query_index.py --restore\n",
-    "```\n",
-    "The model will take ~24hrs to train on two GPUs and ~48hrs to train on one GPU. By default the project directory will be \".\" but can be changed by adding the flag `--project_dir=` after each of the above commands and changing `project_dir=` in the `self_alignment_pretraining.py` command. If you change the project directory, you should also move the `MRCONSO.RRF` file to a `data` subdirectory within the one you've specified. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "As mentioned in the introduction, entity linking within NVIDIA NeMo is not limited to the medical domain. The same data processing and training steps can be applied to a variety of domains and use cases. You can edit the datasets used as well as training and loss function hyperparameters within your config file to better suit your domain."
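Expanding on the PCA suggestion above, the sketch below shows one way the embedding dimensionality could be reduced with faiss's built-in PCA transform before building the IVFFlat index. It is only an illustration under assumed shapes (random placeholder embeddings, 768 input dimensions, 256 output dimensions, 100 IVF lists); the `build_index.py` script mentioned above remains the reference implementation and may handle this differently.

```python
import faiss
import numpy as np

# Placeholder concept embeddings; in practice these come from the trained encoder
embeddings = np.random.rand(10000, 768).astype("float32")

# Train a PCA transform that maps 768-d vectors down to 256-d
pca = faiss.PCAMatrix(768, 256)
pca.train(embeddings)
reduced = pca.apply_py(embeddings)  # shape (10000, 256): a much smaller memory footprint

# Build the IVFFlat index on the reduced vectors instead of the full-size ones
quantizer = faiss.IndexFlatL2(256)
index = faiss.IndexIVFFlat(quantizer, 256, 100)
index.train(reduced)
index.add(reduced)
print(index.ntotal)
```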
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tutorials/nlp/GLUE_Benchmark.ipynb b/tutorials/nlp/GLUE_Benchmark.ipynb deleted file mode 100644 index b77b3439b444..000000000000 --- a/tutorials/nlp/GLUE_Benchmark.ipynb +++ /dev/null @@ -1,566 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "GLUE_Benchmark.ipynb", - "provenance": [], - "private_outputs": true, - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU", - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "source": [], - "metadata": { - "collapsed": false - } - } - } - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "o_0K1lsW1dj9", - "colab_type": "code", - "colab": {} - }, - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "BRANCH = 'main'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "id": "JFWG-jYCfvD7", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# If you're not using Colab, you might need to upgrade jupyter notebook to avoid the following error:\n", - "# 'ImportError: IProgress not found. Please update jupyter and ipywidgets.'\n", - "\n", - "! pip install ipywidgets\n", - "! jupyter nbextension enable --py widgetsnbextension\n", - "\n", - "# Please restart the kernel after running this cell" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "dzqD2WDFOIN-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "daYw_Xll2ZR9", - "colab_type": "text" - }, - "source": [ - "In this tutorial, we are going to describe how to finetune a BERT-like model based on [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) on [GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding](https://openreview.net/pdf?id=rJ4km2R5t7). 
\n", - "\n", - "# GLUE tasks\n", - "GLUE Benchmark includes 9 natural language understanding tasks:\n", - "\n", - "## Single-Sentence Tasks\n", - "\n", - "* CoLA - [The Corpus of Linguistic Acceptability](https://arxiv.org/abs/1805.12471) is a set of English sentences from published linguistics literature. The task is to predict whether a given sentence is grammatically correct or not.\n", - "* SST-2 - [The Stanford Sentiment Treebank](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence: positive or negative.\n", - "\n", - "## Similarity and Paraphrase tasks\n", - "\n", - "* MRPC - [The Microsoft Research Paraphrase Corpus](https://www.aclweb.org/anthology/I05-5002.pdf) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.\n", - "* QQP - [The Quora Question Pairs](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) dataset is a collection of question pairs from the community question-answering website Quora. The task is to determine whether a pair of questions are semantically equivalent.\n", - "* STS-B - [The Semantic Textual Similarity Benchmark](https://arxiv.org/abs/1708.00055) is a collection of sentence pairs drawn from news headlines, video, and image captions, and natural language inference data. The task is to determine how similar two sentences are.\n", - "\n", - "## Inference Tasks\n", - "\n", - "* MNLI - [The Multi-Genre Natural Language Inference Corpus](https://cims.nyu.edu/~sbowman/multinli/multinli_0.9.pdf) is a crowdsourced collection of sentence pairs with textual entailment annotations. Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The task has the matched (in-domain) and mismatched (cross-domain) sections.\n", - "* QNLI - [The Stanford Question Answering Dataset](https://nlp.stanford.edu/pubs/rajpurkar2016squad.pdf) is a question-answering dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding question. The task is to determine whether the context sentence contains the answer to the question.\n", - "* RTE The Recognizing Textual Entailment (RTE) datasets come from a series of annual [textual entailment challenges](https://aclweb.org/aclwiki/Recognizing_Textual_Entailment). The task is to determine whether the second sentence is the entailment of the first one or not.\n", - "* WNLI - The Winograd Schema Challenge is a reading comprehension task in which a system must read a sentence with a pronoun and select the referent of that pronoun from a list of choices (Hector Levesque, Ernest Davis, and Leora Morgenstern. The winograd schema challenge. In Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning. 2012).\n", - "\n", - "All tasks are classification tasks, except for the STS-B task which is a regression task. All classification tasks are 2-class problems, except for the MNLI task which has 3-classes.\n", - "\n", - "More details about GLUE benchmark could be found [here](https://gluebenchmark.com/)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZnuziSwJ1yEB", - "colab_type": "text" - }, - "source": [ - "# Datasets\n", - "\n", - "**To proceed further, you need to download the GLUE data.** For example, you can download [this script](https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py) using `wget` and then execute it by running:\n", - "\n", - "`python download_glue_data.py`\n", - "\n", - "use `--tasks TASK` if datasets for only selected GLUE tasks are needed\n", - "\n", - "After running the above commands, you will have a folder `glue_data` with data folders for every GLUE task. For example, data for MRPC task would be under glue_data/MRPC.\n", - "\n", - "This tutorial and [examples/nlp/glue_benchmark/glue_benchmark.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/glue_benchmark/glue_benchmark.py) work with all GLUE tasks without any modifications. For this tutorial, we are going to use MRPC task.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "--wJ2891aIIE", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# supported task names: [\"cola\", \"sst-2\", \"mrpc\", \"sts-b\", \"qqp\", \"mnli\", \"qnli\", \"rte\", \"wnli\"]\n", - "TASK = 'mrpc'\n", - "DATA_DIR = 'glue_data/MRPC'\n", - "WORK_DIR = \"WORK_DIR\"\n", - "MODEL_CONFIG = 'glue_benchmark_config.yaml'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qB0oLE4R9EhJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! ls -l $DATA_DIR" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gMWuU69pbUDe", - "colab_type": "text" - }, - "source": [ - "For each task, there are 3 files: `train.tsv, dev.tsv, and test.tsv`. Note, MNLI has 2 dev sets: matched and mismatched, evaluation on both dev sets will be done automatically." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6UDPgadLN6SG", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# let's take a look at the training data \n", - "! head -n 5 {DATA_DIR}/train.tsv" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_whKCxfTMo6Y", - "colab_type": "text" - }, - "source": [ - "# Model configuration\n", - "\n", - "Now, let's take a closer look at the model's configuration and learn to train the model.\n", - "\n", - "GLUE model is comprised of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model followed by a Sequence Regression module (for STS-B task) or Sequence classifier module (for the rest of the tasks).\n", - "\n", - "The model is defined in a config file which declares multiple important sections. 
They are:\n", - "- **model**: All arguments that are related to the Model - language model, a classifier, optimizer and schedulers, datasets and any other related information\n", - "\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "T1gA8PsJ13MJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# download the model's configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/glue_benchmark/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print ('config file is already exists')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mX3KmWMvSUQw", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "print(OmegaConf.to_yaml(config))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZCgWzNBkaQLZ", - "colab_type": "text" - }, - "source": [ - "# Model Training\n", - "## Setting up Data within the config\n", - "\n", - "Among other things, the config file contains dictionaries called **dataset**, **train_ds** and **validation_ds**. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", - "\n", - "We assume that both training and evaluation files are located in the same directory, and use the default names mentioned during the data download step. \n", - "So, to start model training, we simply need to specify `model.dataset.data_dir`, like we are going to do below.\n", - "\n", - "Also notice that some config lines, including `model.dataset.data_dir`, have `???` in place of paths, this means that values for these fields are required to be specified by the user.\n", - "\n", - "Let's now add the data directory path, task name and output directory for saving predictions to the config." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LQHCJN-ZaoLp", - "colab_type": "code", - "colab": {} - }, - "source": [ - "config.model.task_name = TASK\n", - "config.model.output_dir = WORK_DIR\n", - "config.model.dataset.data_dir = DATA_DIR" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nB96-3sTc3yk", - "colab_type": "text" - }, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem.\n", - "\n", - "Let's first instantiate a Trainer object" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1tG4FzZ4Ui60", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(\"Trainer config - \\n\")\n", - "print(OmegaConf.to_yaml(config.trainer))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "knF6QeQQdMrH", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# lets modify some trainer configs\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", - "\n", - "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to O1):\n", - "# config.trainer.amp_level = O1\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "# setup max number of steps to reduce training time for demonstration purposes of this tutorial\n", - "config.trainer.max_steps = 128\n", - "\n", - "trainer = pl.Trainer(**config.trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8IlEMdVxdr6p", - "colab_type": "text" - }, - "source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8uztqGAmdrYt", - "colab_type": "code", - "colab": {} - }, - "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "\n", - "# the exp_dir provides a path to the current experiment for easy access\n", - "exp_dir = str(exp_dir)\n", - "exp_dir" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8tjLhUvL_o7_", - "colab_type": "text" - }, - "source": [ - "Before initializing the model, we might want to modify some of the model configs. 
For example, we might want to modify the pretrained BERT model and use [Megatron-LM BERT](https://arxiv.org/abs/1909.08053) or [AlBERT model](https://arxiv.org/abs/1909.11942):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {
-    "id": "Xeuc2i7Y_nP5",
-    "colab_type": "code",
-    "colab": {}
-   },
-   "source": [
-    "# get the list of supported BERT-like models, for the complete list of HuggingFace models, see https://huggingface.co/models\n",
-    "print(nemo_nlp.modules.get_pretrained_lm_models_list(include_external=True))\n",
-    "\n",
-    "# specify the BERT-like model you want to use, for example, \"megatron-bert-345m-uncased\" or 'bert-base-uncased'\n",
-    "PRETRAINED_BERT_MODEL = \"albert-base-v1\""
-   ],
-   "execution_count": null,
-   "outputs": []
-  },
-  {
-   "cell_type": "code",
-   "metadata": {
-    "id": "RK2xglXyAUOO",
-    "colab_type": "code",
-    "colab": {}
-   },
-   "source": [
-    "# add the model parameters specified above to the config\n",
-    "config.model.language_model.pretrained_model_name = PRETRAINED_BERT_MODEL"
-   ],
-   "execution_count": null,
-   "outputs": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "fzNZNAVRjDD-",
-    "colab_type": "text"
-   },
-   "source": [
-    "Now, we are ready to initialize our model. During the model initialization call, the dataset and data loaders will be prepared for training and evaluation.\n",
-    "Also, the pretrained BERT model will be downloaded; note that it can take up to a few minutes depending on the size of the chosen BERT model."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {
-    "id": "NgsGLydWo-6-",
-    "colab_type": "code",
-    "colab": {}
-   },
-   "source": [
-    "model = nemo_nlp.models.GLUEModel(cfg=config.model, trainer=trainer)"
-   ],
-   "execution_count": null,
-   "outputs": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "kQ592Tx4pzyB",
-    "colab_type": "text"
-   },
-   "source": [
-    "## Monitoring training progress\n",
-    "Optionally, you can create a TensorBoard visualization to monitor training progress."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {
-    "id": "mTJr16_pp0aS",
-    "colab_type": "code",
-    "colab": {}
-   },
-   "source": [
-    "try:\n",
-    "    from google import colab\n",
-    "    COLAB_ENV = True\n",
-    "except (ImportError, ModuleNotFoundError):\n",
-    "    COLAB_ENV = False\n",
-    "\n",
-    "# Load the TensorBoard notebook extension\n",
-    "if COLAB_ENV:\n",
-    "    %load_ext tensorboard\n",
-    "    %tensorboard --logdir {exp_dir}\n",
-    "else:\n",
-    "    print(\"To use tensorboard, please use this notebook in a Google Colab environment.\")"
-   ],
-   "execution_count": null,
-   "outputs": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "CFgAlaIdndjW",
-    "colab_type": "text"
-   },
-   "source": [
-    "Note that it’s recommended to finetune the model on each task separately. Also, based on [GLUE Benchmark FAQ#12](https://gluebenchmark.com/faq), there might be some differences in dev/test distributions for the QQP task and in train/dev for the WNLI task."
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "hUvnSpyjp0Dh", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# start model training\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ref1qSonGNhP", - "colab_type": "text" - }, - "source": [ - "## Training Script\n", - "\n", - "If you have NeMo installed locally, you can also train the model with [examples/nlp/glue_benchmark/glue_benchmark.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/glue_benchmark/glue_benchmark.py).\n", - "\n", - "To run training script, use:\n", - "\n", - "`python glue_benchmark.py \\\n", - " model.dataset.data_dir=PATH_TO_DATA_DIR \\\n", - " model.task_name=TASK`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KVPFofXaoKNE", - "colab_type": "text" - }, - "source": [ - "Average results after 3 runs:\n", - "\n", - "| Task | Metric | ALBERT-large | ALBERT-xlarge | Megatron-345m | BERT base paper | BERT large paper |\n", - "|-------|--------------------------|--------------|---------------|---------------|-----------------|------------------|\n", - "| CoLA | Matthew's correlation | 54.94 | 61.72 | 64.56 | 52.1 | 60.5 |\n", - "| SST-2 | Accuracy | 92.74 | 91.86 | 95.87 | 93.5 | 94.9 |\n", - "| MRPC | F1/Accuracy | 92.05/88.97 | 91.87/88.61 | 92.36/89.46 | 88.9/- | 89.3/- |\n", - "| STS-B | Person/Spearman corr. | 90.41/90.21 | 90.07/90.10 | 91.51/91.61 | -/85.8 | -/86.5 |\n", - "| QQP | F1/Accuracy | 88.26/91.26 | 88.80/91.65 | 89.18/91.91 | 71.2/- | 72.1/- |\n", - "| MNLI | Matched /Mismatched acc. | 86.69/86.81 | 88.66/88.73 | 89.86/89.81 | 84.6/83.4 | 86.7/85.9 |\n", - "| QNLI | Accuracy | 92.68 | 93.66 | 94.33 | 90.5 | 92.7 |\n", - "| RTE | Accuracy | 80.87 | 82.86 | 83.39 | 66.4 | 70.1 |\n", - "\n", - "WNLI task was excluded from the experiments due to the problematic WNLI set.\n", - "The dev sets were used for evaluation for ALBERT and Megatron models, and the test sets results for [the BERT paper](https://arxiv.org/abs/1810.04805).\n", - "\n", - "Hyperparameters used to get the results from the above table, could be found in the table below. Some tasks could be further finetuned to improve performance numbers, the tables are for a baseline reference only.\n", - "Each cell in the table represents the following parameters:\n", - "Number of GPUs used/ Batch Size/ Learning Rate/ Number of Epochs. 
For not specified parameters, please refer to the default parameters in the training script.\n", - "\n", - "| Task | ALBERT-large | ALBERT-xlarge | Megatron-345m |\n", - "|-------|--------------|---------------|---------------|\n", - "| CoLA | 1 / 32 / 1e-5 / 3 | 1 / 32 / 1e-5 / 10 | 4 / 16 / 2e-5 / 12 |\n", - "| SST-2 | 4 / 16 / 2e-5 / 5 | 4 / 16 / 2e-5 /12 | 4 / 16 / 2e-5 / 12 |\n", - "| MRPC | 1 / 32 / 1e-5 / 5 | 1 / 16 / 2e-5 / 5 | 1 / 16 / 2e-5 / 10 |\n", - "| STS-B | 1 / 16 / 2e-5 / 5 | 1 / 16 / 4e-5 / 12 | 4 / 16 / 3e-5 / 12 |\n", - "| QQP | 1 / 16 / 2e-5 / 5 | 4 / 16 / 1e-5 / 12 | 4 / 16 / 1e-5 / 12 |\n", - "| MNLI | 4 / 64 / 1e-5 / 5 | 4 / 32 / 1e-5 / 5 | 4 / 32 / 1e-5 / 5 | \n", - "| QNLI | 4 / 16 / 1e-5 / 5 | 4 / 16 / 1e-5 / 5 | 4 / 16 / 2e-5 / 5 | \n", - "| RTE | 1 / 16 / 1e-5 / 5 | 1 / 16 / 1e-5 / 12 | 4 / 16 / 3e-5 / 12 |\n" - ] - } - ] -} diff --git a/tutorials/nlp/MegatronBert_export.ipynb b/tutorials/nlp/MegatronBert_export.ipynb deleted file mode 100644 index c19c07b67005..000000000000 --- a/tutorials/nlp/MegatronBert_export.ipynb +++ /dev/null @@ -1,280 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "8046e96a", - "metadata": {}, - "outputs": [], - "source": [ - "BRANCH='main'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38bfe8ea", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98c00a93", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ] - }, - { - "cell_type": "markdown", - "id": "e9fb1a66", - "metadata": {}, - "source": [ - "### Deprecation Notice\n", - "\n", - "This tutorial is deprecated as of r1.23.0 and will be removed in the next release.\n", - "\n", - "---\n", - "\n", - "# Task Description\n", - "In this tutorial, we are going to describe how to export NeMo NLP models with BERT based models as the pre-trained model." - ] - }, - { - "cell_type": "markdown", - "id": "dd0fb016", - "metadata": {}, - "source": [ - "## Convert the Megatron-LM Weights to Nemo file\n", - "\n", - "If you prefer to use the Huggingface BERT models, please skip this section and refer to `Setting up a NeMo Experiment` section to load a model from `nemo_nlp.modules.get_pretrained_lm_models_list()`\n", - "\n", - "NeMo Megatron BERT can [load from a pretrained model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/core/core.html?highlight=nemo%20file#restore) using `.nemo` file. We can convert the Megatron-LM checkpoint to the `.nemo` file. Let's first download the pretrained model weights and vocabulary file." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e451f219", - "metadata": {}, - "outputs": [], - "source": [ - "from nemo.collections.nlp.modules.common.megatron.megatron_utils import MEGATRON_CONFIG_MAP\n", - "import pathlib\n", - "\n", - "PRETRAINED_BERT_MODEL = \"megatron-bert-345m-uncased\" # specify BERT-like model from MEGATRON_CONFIG_MAP.keys()\n", - "nemo_out_path = \"qa_pretrained.nemo\" # the nemo output file name\n", - "\n", - "checkpoint_url = MEGATRON_CONFIG_MAP[PRETRAINED_BERT_MODEL]['checkpoint']\n", - "vocab_url = MEGATRON_CONFIG_MAP[PRETRAINED_BERT_MODEL]['vocab']\n", - "checkpoint_filename = pathlib.Path(checkpoint_url).name\n", - "vocab_filename = pathlib.Path(vocab_url).name\n", - "if not pathlib.Path(checkpoint_filename).exists():\n", - " print('downloading from checkpoint url', checkpoint_url)\n", - " !wget $checkpoint_url\n", - "if not pathlib.Path(vocab_filename).exists():\n", - " print('downloading from vocab url', vocab_url)\n", - " !wget $vocab_url" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7586b5c0", - "metadata": {}, - "outputs": [], - "source": [ - "WORK_DIR = \"WORK_DIR\"\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "\n", - "# Prepare the model parameters \n", - "# download the model's configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "MODEL_CONFIG = \"megatron_bert_config.yaml\"\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/conf/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print ('config file is already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0dd3124", - "metadata": {}, - "outputs": [], - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "\n", - "config.model.megatron_legacy = True # set to true if you trained the NLP model on NeMo < 1.5.0\n", - "config.model.bias_gelu_fusion = False # set to true if you want the MegatronLM to NeMo conversion for training; and set to false to use the converted model at time of export \n", - "config.model.masked_softmax_fusion = False # set to true if you want the MegatronLM to NeMo conversion for training; and set to false to use the converted model at time of export\n", - "\n", - "config.model.num_layers = 24\n", - "config.model.hidden_size = 1024\n", - "config.model.ffn_hidden_size = 4096\n", - "config.model.num_attention_heads = 16\n", - "config.model.tokenizer.vocab_file = vocab_filename\n", - "config.model.tokenizer.type = 'BertWordPieceLowerCase' # change this to BertWordPieceCase if you are using a cased pretrained model\n", - "config.model.tensor_model_parallel_size = 1\n", - "config.model.data.data_prefix = ''\n", - "config.model.max_position_embeddings = 512\n", - "config.model.data.seq_length = 512\n", - "config.cfg = {}\n", - "config.cfg.cfg = config.model\n", - "with open('hparams.yaml', 'w') as f:\n", - " f.write(OmegaConf.to_yaml(config.cfg))\n", - "if(config.model.megatron_legacy):\n", - " checkpoint_filename = \"model_optim_rng_ca.pt\" #provide path to the pretrained pt file you used during training on NeMo < 1.5.0, for NeMo >= 1.5.0\n", - "print(checkpoint_filename)" - ] - }, - { - "cell_type": "code", - "execution_count": 
null,
-   "id": "47dca6de",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "PWD = os.getcwd()\n",
-    "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py')\n",
-    "!python -m torch.distributed.run --nproc_per_node=1 megatron_lm_ckpt_to_nemo.py --checkpoint_folder=$PWD --checkpoint_name=$checkpoint_filename --hparams_file=$PWD/hparams.yaml --nemo_file_path=$PWD/$nemo_out_path --model_type=bert --tensor_model_parallel_size=1"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1ae8d31b",
-   "metadata": {},
-   "source": [
-    "# Legacy NLP BERT-based model conversion\n",
-    "\n",
-    "Step 1: Convert the legacy NeMo checkpoint to a checkpoint which is currently supported by NeMo\n",
-    "\n",
-    "Step 2: Use the converted model from Step 1 to export the .nemo file to the required format"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "86639a3d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/nemo_legacy_import/nlp_checkpoint_port.py')\n",
-    "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/export.py')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "48820d57",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "legacy_nemo_file_path = \"/NeMo/megatron_multiqa.nemo\" # path to your model trained on NeMo < 1.5\n",
-    "nemo_converted_out_path = \"converted_megatron_multiqa.nemo\"\n",
-    "megatron_absolute_language_model_path = \"/NeMo/tutorials/nlp/qa_pretrained.nemo\" # Give the absolute path of the model you obtained using megatron_lm_ckpt_to_nemo\n",
-    "onnx_export_out_path = \"onnx_megatron_multiqa.onnx\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7191e0cb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.system(f\"python nlp_checkpoint_port.py {legacy_nemo_file_path} {nemo_converted_out_path} --megatron-legacy=True --megatron-checkpoint {megatron_absolute_language_model_path}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ccc720ef",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.system(f\"python export.py {nemo_converted_out_path} {onnx_export_out_path} --autocast --runtime-check\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f10461f2",
-   "metadata": {},
-   "source": [
-    "# Convert an NLP model with a BERT-based pre-trained model trained on NeMo >= 1.5.0\n",
-    "\n",
-    "For models trained on NeMo >= 1.5.0, you can just run the export script and skip the legacy conversion part"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0514ab37",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "nemo_file_path = \"\" # set this to the .nemo file you want to export\n",
-    "onnx_export_out_path = \"\" # set this to the output path for the exported ONNX file"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1d6b5db4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!python export.py $nemo_file_path $onnx_export_out_path --autocast --runtime-check"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.12"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/tutorials/nlp/Question_Answering.ipynb
b/tutorials/nlp/Question_Answering.ipynb deleted file mode 100644 index 054928245d9d..000000000000 --- a/tutorials/nlp/Question_Answering.ipynb +++ /dev/null @@ -1,1163 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "tiIOhb7iVC3J" - }, - "source": [ - "# Overview" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PucJwfbhVC3L" - }, - "source": [ - "### Deprecation Notice\n", - "\n", - "This tutorial is deprecated as of r1.23.0 and will be removed in the next release.\n", - "\n", - "---\n", - "\n", - "This tutorial will demonstrate how to train, evaluate, and test three types of models for Question-Answering -\n", - "1. BERT-like models for Extractive Question-Answering\n", - "2. Sequence-to-Sequence (S2S) models for Generative Question-Answering (ex. T5/BART-like)\n", - "3. GPT-like models for Generative Question-Answering\n", - "\n", - "## Task Description\n", - "\n", - "- Given a context and a natural language query, we want to generate an answer for the query\n", - "- Depending on how the answer is generated, the task can be broadly divided into two types:\n", - " 1. Extractive Question Answering\n", - " 2. Generative Question Answering\n", - "\n", - "\n", - "### Extractive Question-Answering with BERT-like models\n", - "\n", - "Given a question and a context, both in natural language, predict the span within the context with a start and end position which indicates the answer to the question.\n", - "For every word in our training dataset we’re going to predict:\n", - "- likelihood this word is the start of the span \n", - "- likelihood this word is the end of the span\n", - "\n", - "We are using a BERT encoder with 2 span prediction heads for predicting start and end position of the answer. The span predictions are token classifiers consisting of a single linear layer.\n", - "\n", - "### Generative Question-Answering with S2S and GPT-like models\n", - "\n", - "Given a question and a context, both in natural language, generate an answer for the question. Unlike the BERT-like models, there is no constraint that the answer should be a span within the context." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IpX0w2PtVC3M" - }, - "source": [ - "# Installing NeMo" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "72XWYFQYVC3M" - }, - "source": [ - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run the cell below to set up dependencies." 
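As a concrete illustration of the extractive setup described above (a BERT encoder with two span-prediction heads implemented as a single linear layer over each token), here is a minimal plain-PyTorch sketch. It uses a Hugging Face `bert-base-uncased` encoder and a toy question/context pair for demonstration; it is not NeMo's `BERTQAModel` implementation, only the idea behind it.

```python
import torch
from torch import nn
from transformers import AutoModel, AutoTokenizer

class TinySpanHead(nn.Module):
    def __init__(self, model_name="bert-base-uncased"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        # One linear layer produces 2 scores per token: a start logit and an end logit
        self.qa_outputs = nn.Linear(self.encoder.config.hidden_size, 2)

    def forward(self, **inputs):
        hidden = self.encoder(**inputs).last_hidden_state          # (batch, seq, hidden)
        start_logits, end_logits = self.qa_outputs(hidden).split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)    # (batch, seq) each

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch = tokenizer("What does NVIDIA develop?", "NVIDIA develops the NeMo toolkit.", return_tensors="pt")
with torch.no_grad():
    start_logits, end_logits = TinySpanHead()(**batch)
# The highest-scoring (start, end) token pair inside the context is read off as the answer span
print(start_logits.argmax(-1), end_logits.argmax(-1))
```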
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_xQBtr0KVC3M" - }, - "outputs": [], - "source": [ - "BRANCH = 'main'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9R1D6W58VC3N" - }, - "outputs": [], - "source": [ - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fof5-57iVC3N" - }, - "source": [ - "# Imports and constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KqKD-wReVC3O" - }, - "outputs": [], - "source": [ - "import os\n", - "import wget\n", - "import gc\n", - "\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf\n", - "\n", - "from nemo.collections.nlp.models.question_answering.qa_bert_model import BERTQAModel\n", - "from nemo.collections.nlp.models.question_answering.qa_gpt_model import GPTQAModel\n", - "from nemo.collections.nlp.models.question_answering.qa_s2s_model import S2SQAModel\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "pl.seed_everything(42)\n", - "gc.disable()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xhPr9Jf_VC3O" - }, - "outputs": [], - "source": [ - "# set the following paths\n", - "DATA_DIR = \"data_dir\" # directory for storing datasets\n", - "WORK_DIR = \"work_dir\" # directory for storing trained models, logs, additionally downloaded scripts\n", - "\n", - "os.makedirs(DATA_DIR, exist_ok=True)\n", - "os.makedirs(WORK_DIR, exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dWymW8e0VC3O" - }, - "source": [ - "# Configuration" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0YhKTkuXVC3P" - }, - "source": [ - "The model is defined in a config file which declares multiple important sections:\n", - "- **model**: All arguments that will relate to the Model - language model, span prediction, optimizer and schedulers, datasets and any other related information\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning\n", - "- **exp_manager**: All arguments used for setting up the experiment manager - target directory, name, logger information\n", - "\n", - "We will download the default config file provided at `NeMo/examples/nlp/question_answering/conf/qa_conf.yaml` and edit necessary values for training different models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WOIWJqQ0VC3P" - }, - "outputs": [], - "source": [ - "# download the model's default configuration file \n", - "config_dir = WORK_DIR + '/conf/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + \"qa_conf.yaml\"):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/conf/qa_conf.yaml', config_dir)\n", - "else:\n", - " print ('config file already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cvD-gv-FVC3P" - }, - "outputs": [], - "source": [ - "# this will print the entire default config of the model\n", - "config_path = f'{WORK_DIR}/conf/qa_conf.yaml'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "print(\"Default Config - \\n\")\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "E08e-ItPVC3P" - }, - "source": [ 
- "# Training and testing models on SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xn022MsKVC3Q" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "c356CGL1VC3Q" - }, - "source": [ - "For this example, we are going to download the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset to showcase how to do training and inference. There are two datasets, SQuAD1.0 and SQuAD2.0. SQuAD 1.1, the previous version of the SQuAD dataset, contains 100,000+ question-answer pairs on 500+ articles. SQuAD2.0 dataset combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gaju1h_bVC3Q" - }, - "source": [ - "To download both datasets, we use `NeMo/examples/nlp/question_answering/get_squad.py`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nb840_bZVC3Q" - }, - "outputs": [], - "source": [ - "# download get_squad.py script to download and preprocess the SQuAD data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/get_squad.py'):\n", - " print('Downloading get_squad.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/get_squad.py', WORK_DIR)\n", - "else:\n", - " print ('get_squad.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sOgY0tRzVC3Q" - }, - "outputs": [], - "source": [ - "# download and preprocess the data\n", - "!python $WORK_DIR/get_squad.py --destDir $DATA_DIR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nprGkyvRVC3Q" - }, - "source": [ - "After execution of the above cell, your data folder will contain a subfolder \"squad\" the following four files for training and evaluation\n", - "\n", - "```\n", - "squad \n", - "│\n", - "└───v1.1\n", - "│ │ - train-v1.1.json\n", - "│ │ - dev-v1.1.json\n", - "│\n", - "└───v2.0\n", - " │ - train-v2.0.json\n", - " │ - dev-v2.0.json\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GX0KWQXKVC3Q" - }, - "outputs": [], - "source": [ - "!ls -LR {DATA_DIR}/squad" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RFVcvseOVC3R" - }, - "source": [ - "## Set dataset config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Grb0EeRqVC3R" - }, - "outputs": [], - "source": [ - "# if True, model will load features from cache if file is present, or\n", - "# create features and dump to cache file if not already present\n", - "config.model.dataset.use_cache = False\n", - "\n", - "# indicates whether the dataset has unanswerable questions\n", - "config.model.dataset.version_2_with_negative = True\n", - "\n", - "# indicates whether the dataset is of extractive nature or not\n", - "# if True, context spans/chunks that do not contain answer are treated as unanswerable \n", - "config.model.dataset.check_if_answer_in_context = True\n", - "\n", - "# set file paths for train, validation, and test datasets\n", - "config.model.train_ds.file = f\"{DATA_DIR}/squad/v2.0/train-v2.0.json\"\n", - "config.model.validation_ds.file = f\"{DATA_DIR}/squad/v2.0/dev-v2.0.json\"\n", - "config.model.test_ds.file = f\"{DATA_DIR}/squad/v2.0/dev-v2.0.json\"\n", - "\n", - "# set batch sizes for train, validation, and test 
datasets\n", - "config.model.train_ds.batch_size = 8\n", - "config.model.validation_ds.batch_size = 8\n", - "config.model.test_ds.batch_size = 8\n", - "\n", - "# set number of samples to be used from dataset. setting to -1 uses entire dataset\n", - "config.model.train_ds.num_samples = 5000\n", - "config.model.validation_ds.num_samples = 1000\n", - "config.model.test_ds.num_samples = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rFWF41VwVC3R" - }, - "source": [ - "## Set trainer config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "42yif-GIVC3R" - }, - "outputs": [], - "source": [ - "config.trainer.max_epochs = 1\n", - "config.trainer.max_steps = -1 # takes precedence over max_epochs\n", - "config.trainer.precision = 16\n", - "config.trainer.devices = [0] # 0 for CPU, or list of the GPUs to use [0] this tutorial does not support multiple GPUs. If needed please use NeMo/examples/nlp/question_answering/question_answering.py\n", - "config.trainer.accelerator = \"gpu\"\n", - "config.trainer.strategy=\"auto\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EDQzMBlbVC3R" - }, - "source": [ - "## Set experiment manager config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pxY4rnJBVC3R" - }, - "outputs": [], - "source": [ - "config.exp_manager.exp_dir = WORK_DIR\n", - "config.exp_manager.name = \"QA-SQuAD2\"\n", - "config.exp_manager.create_wandb_logger=False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N2_C8reNVC3R" - }, - "source": [ - "## BERT model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4Mf-_rioVC3R" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gtlGHzVJVC3R" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"bert-base-uncased\"\n", - "config.model.tokenizer.tokenizer_name = \"bert-base-uncased\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bert_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 3e-5" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RaM7fe8rVC3R" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ukLzGmy9VC3R" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(**config.trainer)\n", - "model = BERTQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qZIA69rlVC3R" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "asutB9ZzVC3R" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "n5AIv0SEVC3S" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7k5kD6tvVC3S" - }, - "outputs": [], - "source": [ - "model = BERTQAModel.restore_from(config.model.nemo_path)\n", - "\n", - 
"eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zyh0SNiyVC3S" - }, - "source": [ - "## S2S BART model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Sy9IYgVYVC3S" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PKNmHKV5VC3S" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"facebook/bart-base\"\n", - "config.model.tokenizer.tokenizer_name = \"facebook/bart-base\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bart_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 5e-5\n", - "\n", - "#remove vocab_file from gpt model\n", - "config.model.tokenizer.vocab_file = None" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S_0glS4yVC3S" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8jWyHY1oVC3S" - }, - "outputs": [], - "source": [ - "# uncomment below line and run if you get an error while initializing tokenizer on Colab (reference: https://github.com/huggingface/transformers/issues/8690)\n", - "# !rm -r /root/.cache/huggingface/\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "model = S2SQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xg-j39b4VC3S" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ocsf0EBDVC3S" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Vs3pl0VMVC3S" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NoW6_GO_VC3S" - }, - "outputs": [], - "source": [ - "model = S2SQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - 
"config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a7-iInbPVC3S" - }, - "source": [ - "## GPT2 model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VaIC0l2aVC3S" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5j6SVk6fVC3S" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"gpt2\"\n", - "config.model.tokenizer.tokenizer_name = \"gpt2\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/gpt2_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 1e-4" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rWhhEuvzVC3S" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vBtP3ukDVC3S" - }, - "outputs": [], - "source": [ - "# uncomment below line and run if you get an error while initializing tokenizer on Colab (reference: https://github.com/huggingface/transformers/issues/8690)\n", - "# !rm -r /root/.cache/huggingface/\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "model = GPTQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EApFrJh8VC3T" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zYo2JDdOVC3T" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6aNEt06fVC3T" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ioLT4DVbVC3T" - }, - "outputs": [], - "source": [ - "model = GPTQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " 
output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hTWOlD9AVC3T" - }, - "source": [ - "# Training and testing models on MS-MARCO" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lZWsMwnGVC3T" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pRUAwgAbVC3T" - }, - "source": [ - "### Downloading the data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qz3DO9JGVC3T" - }, - "source": [ - "MS-MARCO(Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. MS-MARCO consists of 1,010,916 queries generated from real, anonymized Bing user queries. The contexts are extracted from real web documents and the answers are generated by humans.\n", - "\n", - "Please agree to the Terms of Use at https://microsoft.github.io/msmarco/ before downloading the data\n", - "\n", - "The data can be downloaded at:\n", - "- https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz\n", - "- https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Fm5MzZ91inP5" - }, - "outputs": [], - "source": [ - "os.makedirs(os.path.join(DATA_DIR, \"msmarco\"), exist_ok=True)\n", - "\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz -P $DATA_DIR/msmarco\n", - "!gunzip $DATA_DIR/msmarco/train_v2.1.json.gz\n", - "\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz -P $DATA_DIR/msmarco\n", - "!gunzip $DATA_DIR/msmarco/dev_v2.1.json.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nDmFHzBtVC3T" - }, - "source": [ - "### Converting to SQuAD format\n", - "\n", - "The script for converting MS-MARCO dataset to SQuAD can be found at `NeMo/examples/nlp/question_answering/convert_msmarco_to_squad_format.py`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tJtNIzZQVC3T" - }, - "outputs": [], - "source": [ - "# download convert_msmarco_to_squad_format.py script to format the MS-MARCO data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/convert_msmarco_to_squad_format.py'):\n", - " print('Downloading convert_msmarco_to_squad_format.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/convert_msmarco_to_squad_format.py', WORK_DIR)\n", - "else:\n", - " print ('convert_msmarco_to_squad_format.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Io_esJPSuBcW" - }, - "outputs": [], - "source": [ - "# we will exclude examples from MS-MARCO dataset that do not have a wellFormedAnswer using a utility script\n", - "# download remove_ms_marco_samples_without_wellFormedAnswers.py script to format the MS-MARCO data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/remove_ms_marco_samples_without_wellFormedAnswers.py'):\n", - " print('Downloading remove_ms_marco_samples_without_wellFormedAnswers.py...')\n", - " 
wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py', WORK_DIR)\n", - "else:\n", - " print ('remove_ms_marco_samples_without_wellFormedAnswers.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cs_CXkfXuYVQ" - }, - "outputs": [], - "source": [ - "!python $WORK_DIR/remove_ms_marco_samples_without_wellFormedAnswers.py --filename $DATA_DIR/msmarco/train_v2.1.json\n", - "!python $WORK_DIR/remove_ms_marco_samples_without_wellFormedAnswers.py --filename $DATA_DIR/msmarco/dev_v2.1.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AUAKI086VC3T" - }, - "outputs": [], - "source": [ - "!(python $WORK_DIR/convert_msmarco_to_squad_format.py \\\n", - " --msmarco_train_input_filepath=$DATA_DIR/msmarco/train_v2.1.json \\\n", - " --msmarco_dev_input_filepath=$DATA_DIR/msmarco/dev_v2.1.json \\\n", - " --converted_train_save_path=$DATA_DIR/msmarco/msmarco-squad-format-train-v2.1.json \\\n", - " --converted_dev_save_path=$DATA_DIR/msmarco/msmarco-squad-format-dev-v2.1.json \\\n", - " --exclude_negative_samples=False \\\n", - " --keep_only_relevant_passages=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AeHesaFcVC3T" - }, - "source": [ - "## Set dataset config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rhx-_1X3VC3T" - }, - "outputs": [], - "source": [ - "# if True, model will load features from cache if file is present, or\n", - "# create features and dump to cache file if not already present\n", - "config.model.dataset.use_cache = False\n", - "\n", - "# indicates whether the dataset has unanswerable questions\n", - "config.model.dataset.version_2_with_negative = True\n", - "\n", - "# if True, context spans/chunks that do not contain answer are treated as unanswerable \n", - "# should be False for MS-MARCO dataset, or other datasets of generative nature\n", - "config.model.dataset.check_if_answer_in_context = False\n", - "\n", - "# set file paths for train, validation, and test datasets\n", - "config.model.train_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-train-v2.1.json\"\n", - "config.model.validation_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-dev-v2.1.json\"\n", - "config.model.test_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-dev-v2.1.json\"\n", - "\n", - "# set batch sizes for train, validation, and test datasets\n", - "config.model.train_ds.batch_size = 16\n", - "config.model.validation_ds.batch_size = 16\n", - "config.model.test_ds.batch_size = 16\n", - "\n", - "# set number of samples to be used from dataset. setting to -1 uses entire dataset\n", - "config.model.train_ds.num_samples = 5000\n", - "config.model.validation_ds.num_samples = 1000\n", - "config.model.test_ds.num_samples = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X43k_EeqVC3T" - }, - "source": [ - "## Set trainer config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HavpkQLPVC3U" - }, - "outputs": [], - "source": [ - "config.trainer.max_epochs = 1\n", - "config.trainer.max_steps = -1 # takes precedence over max_epochs\n", - "config.trainer.precision = 16\n", - "config.trainer.devices = [0] # 0 for CPU, or list of the GPUs to use e.g. 
[0, 1] or [0]\n", - "config.trainer.accelerator = \"gpu\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R-_FIZE2VC3U" - }, - "source": [ - "## Set experiment manager config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "10TT3okiVC3U" - }, - "outputs": [], - "source": [ - "config.exp_manager.exp_dir = WORK_DIR\n", - "config.exp_manager.name = \"QA-MSMARCO\"\n", - "config.exp_manager.create_wandb_logger=False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MKIq6YT-VC3U" - }, - "source": [ - "## S2S BART model for MS-MARCO" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvf-QpYLVC3U" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DDVZ1a5fVC3U" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"facebook/bart-base\"\n", - "config.model.tokenizer.tokenizer_name = \"facebook/bart-base\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bart_msmarco_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 5e-5" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3N75cdLRVC3U" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Bv9UMkfxVC3U" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(**config.trainer)\n", - "model = S2SQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BhVuV9sWVC3U" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1JeaJ_OgVC3U" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yj0dGexaVC3U" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "l1elN-WDVC3U" - }, - "outputs": [], - "source": [ - "model = S2SQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": 
"Question_Answering.ipynb", - "provenance": [] - }, - "gpuClass": "standard", - "kernelspec": { - "display_name": "Python 3.8.0 ('test_ptl_1.7')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "e987a19b1bc60996a600adb5d563aa4a4c022e7b31abb2e65c324714934e8ea9" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb deleted file mode 100644 index 71c7ca505144..000000000000 --- a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb +++ /dev/null @@ -1,1412 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "PiRuohn_FQco" - }, - "source": [ - "# Overview\n", - "This tutorial demonstrates how to run inference with [SpellMapper](https://arxiv.org/abs/2306.02317) - a model for Spellchecking ASR (Automatic Speech Recognition) Customization.\n", - "\n", - "Estimated time: 10-15 min.\n", - "\n", - "SpellMapper is a non-autoregressive (NAR) model based on transformer architecture ([BERT](https://arxiv.org/pdf/1810.04805.pdf) with multiple separators).\n", - "It gets as input a single ASR hypothesis (text) and a **custom vocabulary** and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any.\n", - "\n", - "This model is an alternative to word boosting/shallow fusion approaches:\n", - " - does not require retraining ASR model;\n", - " - does not require beam-search/language model(LM);\n", - " - can be applied on top of any English ASR model output;" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qm5wmxVEGXgH" - }, - "source": [ - "## What is custom vocabulary?\n", - "**Custom vocabulary** is a list of words/phrases that are important for a particular user. For example, user's contact names, playlist, selected terminology and so on. The size of the custom vocabulary can vary from several hundreds to **several thousand entries** - but this is not an equivalent to ngram language model.\n", - "\n", - "![Scope of customization with user vocabulary](images/spellmapper_customization_vocabulary.png)\n", - "\n", - "Note that unlike traditional spellchecking approaches, which aim to correct known words using language models, the goal of contextual spelling correction is to correct highly specific user terms, most of which can be 1) out-of-vocabulary (OOV) words, 2) spelling variations (e.g., \"John Koehn\", \"Jon Cohen\") and language models cannot help much with that." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D5_XwuXDOKho" - }, - "source": [ - "## Tutorial Plan\n", - "\n", - "1. Create a sample custom vocabulary using some medical terminology.\n", - "2. Study what customization does - a detailed analysis of a small example.\n", - "3. 
Run a bigger example:\n", - " * Create sample ASR results by running TTS (text-to-speech synthesis) + ASR on some medical paper abstracts.\n", - " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n", - "\n", - "TL;DR We reduce WER from `14.3%` to `11.4%` by correcting medical terms, e.g.\n", - "* `puramesin` => `puromycin`\n", - "* `parromsin` => `puromycin`\n", - "* `and hydrod` => `anhydride`\n", - "* `lesh night and` => `lesch-nyhan`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "agz8B2CxXBBG" - }, - "source": [ - "# Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "koRPpYISNPuH" - }, - "source": [ - "## Installing NeMo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HCnnz3cgVc4Q" - }, - "outputs": [], - "source": [ - "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", - "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", - "GITHUB_ACCOUNT = \"NVIDIA\"\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]\n", - "\n", - "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", - "# comment out the below lines and set NEMO_DIR to your local path.\n", - "NEMO_DIR = 'nemo'\n", - "!git clone -b {BRANCH} https://github.com/{GITHUB_ACCOUNT}/NeMo.git $NEMO_DIR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_M92gCn_NW1_" - }, - "source": [ - "## Additional installs\n", - "We will use `sentence_splitter` to split abstracts to sentences." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ddyJA3NtGl9C" - }, - "outputs": [], - "source": [ - "!pip install sentence_splitter" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qVa91rGkeFje" - }, - "source": [ - "Clone the SpellMapper model from HuggingFace.\n", - "Note that we will need not only the checkpoint itself, but also the ngram mapping vocabulary `replacement_vocab_filt.txt` from the same folder." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JiI9dkEm5cpW" - }, - "outputs": [], - "source": [ - "!git clone https://huggingface.co/bene-ges/spellmapper_asr_customization_en" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8saqFOePVfFf" - }, - "source": [ - "## Imports\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tAJyiYn_VnrF" - }, - "outputs": [], - "source": [ - "import IPython.display as ipd\n", - "import json\n", - "import random\n", - "import re\n", - "import soundfile as sf\n", - "import torch\n", - "\n", - "from collections import Counter, defaultdict\n", - "from difflib import SequenceMatcher\n", - "from matplotlib.pyplot import imshow\n", - "from matplotlib import pyplot as plt\n", - "from sentence_splitter import SentenceSplitter\n", - "from typing import List, Set, Tuple\n", - "\n", - "from nemo.collections.tts.models import FastPitchModel\n", - "from nemo.collections.tts.models import HifiGanModel\n", - "\n", - "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest\n", - "\n", - "from nemo.collections.nlp.data.spellchecking_asr_customization.utils import (\n", - " get_all_candidates_coverage,\n", - " get_index,\n", - " load_ngram_mappings,\n", - " search_in_index,\n", - " get_candidates,\n", - " read_spellmapper_predictions,\n", - " apply_replacements_to_text,\n", - " load_ngram_mappings_for_dp,\n", - " get_alignment_by_dp,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mfAaOdAWUGUV" - }, - "source": [ - "Use seed to get a reproducible behaviour." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UlGnNKTuT_6A" - }, - "outputs": [], - "source": [ - "random.seed(0)\n", - "torch.manual_seed(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RPPHI7Zd_fDz" - }, - "source": [ - "## Download data\n", - "\n", - "File `pubmed24n0009.xml` taken from public ftp server of https://www.ncbi.nlm.nih.gov/pmc/ contains information about 5593 medical papers, from which we extract only their abstracts. We will feed sentences from there to TTS + ASR to get initial ASR results.\n", - "\n", - "File `wordlist.txt` contains 100k **single-word** medical terms.\n", - "\n", - "File `valid_adam.txt` contains 24k medical abbreviations with their full forms. We will use those full forms as examples of **multi-word** medical terms.\n", - "\n", - "File `count_1w.txt` contains 330k single words with their frequencies from Google Ngrams corpus. 
We will use this file to filter out frequent words from our custom vocabulary.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mX6cvE8xw2n1" - }, - "outputs": [], - "source": [ - "!wget https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0009.xml.gz\n", - "!gunzip pubmed24n0009.xml.gz\n", - "!grep \"AbstractText\" pubmed24n0009.xml > abstract.txt\n", - "\n", - "!wget https://raw.githubusercontent.com/McGill-NLP/medal/master/toy_data/valid_adam.txt\n", - "!wget https://raw.githubusercontent.com/glutanimate/wordlist-medicalterms-en/master/wordlist.txt\n", - "!wget https://norvig.com/ngrams/count_1w.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mBm9BeqNaRlC" - }, - "source": [ - "## Auxiliary functions\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kVUKhSh48Ypi" - }, - "outputs": [], - "source": [ - "CHARS_TO_IGNORE_REGEX = re.compile(r\"[\\.\\,\\?\\:!;()«»…\\]\\[/\\*–‽+&_\\\\½√>€™$•¼}{~—=“\\\"”″‟„]\")\n", - "\n", - "\n", - "def get_medical_vocabulary() -> Tuple[Set[str], Set[str]]:\n", - " \"\"\"This function builds a vocabulary of medical terms using downloaded sources:\n", - " wordlist.txt - 100k single-word medical terms.\n", - " valid_adam.txt - 24k medical abbreviations with their full forms. We use those full forms as examples of multi-word medical terms.\n", - " count_1w.txt - 330k single words with their frequencies from Google Ngrams corpus. We will use this file to filter out frequent words from our custom vocabulary.\n", - " \"\"\"\n", - " common_words = set()\n", - " with open(\"count_1w.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " word, freq = line.strip().casefold().split(\"\\t\")\n", - " if int(freq) < 500000:\n", - " break\n", - " common_words.add(word)\n", - " print(\"Size of common words vocabulary:\", len(common_words))\n", - "\n", - " abbreviations = defaultdict(set)\n", - " medical_vocabulary = set()\n", - " with open(\"valid_adam.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " lines = f.readlines()\n", - " # first line is header\n", - " for line in lines[1:]:\n", - " abbrev, _, phrase = line.strip().split(\"\\t\")\n", - " # skip phrases longer than 3 words because some of them are long explanations\n", - " if phrase.count(\" \") > 2:\n", - " continue\n", - " if phrase in common_words:\n", - " continue\n", - " medical_vocabulary.add(phrase)\n", - " abbrev = abbrev.lower()\n", - " abbreviations[abbrev].add(phrase)\n", - "\n", - " with open(\"wordlist.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " word = line.strip().casefold()\n", - " # skip words containing digits\n", - " if re.match(r\".*\\d.*\", word):\n", - " continue\n", - " if re.match(r\".*[\\[\\]\\(\\)\\+\\,\\.].*\", word):\n", - " continue\n", - " if word in common_words:\n", - " continue\n", - " medical_vocabulary.add(word)\n", - "\n", - " print(\"Size of medical vocabulary:\", len(medical_vocabulary))\n", - " print(\"Size of abbreviation vocabulary:\", len(abbreviations))\n", - " return medical_vocabulary, abbreviations\n", - "\n", - "\n", - "def read_abstracts(medical_vocabulary: Set[str]) -> Tuple[List[str], Set[str], Set[str]]:\n", - " \"\"\"This function reads the downloaded medical abstracts, and extracts sentences containing any word/phrase from the medical vocabulary.\n", - " Args:\n", - " medical_vocabulary: set of known medical words or phrases\n", - " Returns:\n", - " sentences: list of extracted sentences\n", - " 
all_found_singleword: set of single words from medical vocabulary that occurred at least in one sentence\n", - " all_found_multiword: set of multi-word phrases from medical vocabulary that occurred at least in one sentence\n", - " \"\"\"\n", - " splitter = SentenceSplitter(language='en')\n", - "\n", - " all_sentences = []\n", - " all_found_singleword = set()\n", - " all_found_multiword = set()\n", - " with open(\"abstract.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " text = line.strip().replace(\"\", \"\").replace(\"\", \"\")\n", - " sents = splitter.split(text)\n", - " found_singleword = set()\n", - " found_multiword = set()\n", - " for sent in sents:\n", - " # remove anything in brackets from text\n", - " sent = re.sub(r\"\\(.+\\)\", r\"\", sent)\n", - " # remove quotes from text\n", - " sent = sent.replace(\"\\\"\", \"\")\n", - " # skip sentences containing digits because normalization is out of scope of this tutorial\n", - " if re.match(r\".*\\d.*\", sent):\n", - " continue\n", - " # skip sentences containing abbreviations with period inside the sentence (for the same reason)\n", - " if \". \" in sent:\n", - " continue\n", - " # skip long sentences as they may cause OOM issues\n", - " if len(sent) > 150:\n", - " continue\n", - " # replace all punctuation to space and convert to lowercase\n", - " sent_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", sent).lower()\n", - " sent_clean = \" \".join(sent_clean.split(\" \"))\n", - " words = sent_clean.split(\" \")\n", - "\n", - " found_phrases = set()\n", - " for begin in range(len(words)):\n", - " for end in range(begin + 1, min(begin + 4, len(words))):\n", - " phrase = \" \".join(words[begin:end])\n", - " if phrase in medical_vocabulary:\n", - " found_phrases.add(phrase)\n", - " if end - begin == 1:\n", - " found_singleword.add(phrase)\n", - " else:\n", - " found_multiword.add(phrase)\n", - " if len(found_phrases) > 0:\n", - " all_sentences.append((sent, \";\".join(found_phrases)))\n", - " all_found_singleword = all_found_singleword.union(found_singleword)\n", - " all_found_multiword = all_found_multiword.union(found_multiword)\n", - "\n", - " print(\"Sentences:\", len(all_sentences))\n", - " print(\"Unique single-word terms found:\", len(all_found_singleword))\n", - " print(\"Unique multi-word terms found:\", len(all_found_multiword))\n", - " print(\"Examples of multi-word terms\", str(list(all_found_multiword)[0:10]))\n", - " \n", - " return all_sentences, all_found_singleword, all_found_multiword" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XU3xeCBVpWOL" - }, - "outputs": [], - "source": [ - "def get_fragments(i_words: List[str], j_words: List[str]) -> List[Tuple[str, str, str, int, int, int, int]]:\n", - " \"\"\"This function is used to compare two word sequences to find minimal fragments that differ.\n", - " Args:\n", - " i_words: list of words in first sequence\n", - " j_words: list of words in second sequence\n", - " Returns:\n", - " list of tuples (difference_type, fragment1, fragment2, begin_of_fragment1, end_of_fragment1, begin_of_fragment2, end_of_fragment2)\n", - " \"\"\"\n", - " s = SequenceMatcher(None, i_words, j_words)\n", - " result = []\n", - " for tag, i1, i2, j1, j2 in s.get_opcodes():\n", - " result.append((tag, \" \".join(i_words[i1:i2]), \" \".join(j_words[j1:j2]), i1, i2, j1, j2))\n", - " result = sorted(result, key=lambda x: x[3])\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2ydXp_pFYmYu" - }, - "source": [ - "## 
Read medical data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WAeauax0SV1-" - }, - "outputs": [], - "source": [ - "medical_vocabulary, _ = get_medical_vocabulary()\n", - "sentences, found_singleword, found_multiword = read_abstracts(medical_vocabulary)\n", - "# in case if we need random candidates from a big sample - we will use full medical vocabulary for that purpose.\n", - "big_sample = list(medical_vocabulary)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FRli7-Kx7sOO" - }, - "outputs": [], - "source": [ - "for sent, phrases in sentences[0:10]:\n", - " print(sent, \"\\t\", phrases)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rL1VqH2_dk93" - }, - "source": [ - "# SpellMapper ASR Customization\n", - "\n", - "SpellMapper model relies on two offline preparation steps:\n", - "1. Collecting n-gram mappings from a large corpus (this mappings vocabulary had been collected once on a large corpus and is supplied with the model).\n", - "2. Indexing of user vocabulary by n-grams.\n", - "\n", - "![Offline data preparation](images/spellmapper_data_preparation.png)\n", - "\n", - "At inference time we take as input an ASR hypothesis and an n-gram-indexed user vocabulary and perform following steps:\n", - "1. Retrieve the top 10 candidate phrases from the user vocabulary that are likely to be contained in the given ASR-hypothesis, possibly in a misspelled form.\n", - "2. Run the neural model that tags the input characters with correct candidate labels or 0 if no match is found.\n", - "3. Do post-processing to combine results.\n", - "\n", - "![Inference pipeline](images/spellmapper_inference_pipeline.png)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OeJpsMwslmrd" - }, - "source": [ - "## N-gram mappings\n", - "Note that n-gram mappings vocabulary had been collected from a large corpus and is supplied with the model. It is supposed to be \"universal\" for English language.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uH6p0mOd12pi" - }, - "source": [ - "Let's see what n-gram mappings are like, for example, for an n-gram `l u c`.\n", - "Note that n-grams in `replacement_vocab_filt.txt` preserve one-to-one correspondence between original letters and misspelled fragments (this additional markup is handled during loading). \n", - "* `+` means that adjacent letters are concatenated and correspond to a single source letter. \n", - "* `` means that the original letter is deleted. \n", - "This auxiliary markup will be removed automatically during loading.\n", - "\n", - "`_` is used instead of real space symbol.\n", - "\n", - "Last three columns are:\n", - "* joint frequency\n", - "* frequency of original n-gram\n", - "* frequency of misspelled n-gram\n", - "\n", - "$$\\frac{JointFrequency}{SourceFrequency}=TranslationProbability$$\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qul163dB1sKp" - }, - "outputs": [], - "source": [ - "!awk 'BEGIN {FS=\"\\t\"} ($1==\"l u c\"){print $0}' < spellmapper_asr_customization_en/replacement_vocab_filt.txt | sort -t$'\\t' -k3nr" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eWxcrVWZ3Pfq" - }, - "source": [ - "Now we read n-gram mappings from the file. Parameter `max_misspelled_freq` controls maximum frequency of misspelled n-grams. N-grams more frequent than that are put in the list of banned n-grams and won't be used in indexing." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WHKhE945-N7o" - }, - "outputs": [], - "source": [ - "print(\"load n-gram mappings...\")\n", - "ngram_mapping_vocab, ban_ngram = load_ngram_mappings(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\", max_misspelled_freq=125000)\n", - "# CAUTION: entries in ban_ngram end with a space and can contain \"+\" \"=\"\n", - "print(\"Size of ngram mapping vocabulary:\", len(ngram_mapping_vocab))\n", - "print(\"Size of banned ngrams:\", len(ban_ngram))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "49IcMBfllvXN" - }, - "source": [ - "## Indexing of custom vocabulary" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b1K6paeee2Iu" - }, - "source": [ - "As we mentioned earlier, this model pipeline is intended to work with custom vocabularies up to several thousand entries. Since the whole medical vocabulary contains 110k entries, we restrict our custom vocabulary to 5000+ terms that occurred in given corpus of abstracts.\n", - "\n", - "The goal of indexing our custom vocabulary is to build an index where key is a letter n-gram and value is the whole phrase. The keys are n-grams in the given user phrase and their misspelled variants taken from our collection of n-\n", - "gram mappings (see Index of custom vocabulary in Fig. 1)\n", - "\n", - "*Though it is possible to index and search the whole 110k vocabulary, it will require additional optimizations and is beyond the scope of this tutorial.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xWb0jGqw6Woi" - }, - "outputs": [], - "source": [ - "custom_phrases = []\n", - "for phrase in medical_vocabulary:\n", - " if phrase not in found_singleword and phrase not in found_multiword:\n", - " continue\n", - " custom_phrases.append(\" \".join(list(phrase.replace(\" \", \"_\"))))\n", - "print(\"Size of customization vocabulary:\", len(custom_phrases))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UHWor5pD2Eyb" - }, - "source": [ - "Now we build the index for our custom phrases.\n", - "\n", - "Parameter `min_log_prob` controls minimum log probability, after which we stop growing this n-gram.\n", - "\n", - "Parameter `max_phrases_per_ngram` controls maximum number of phrases that can be indexed by one ngram. 
N-grams exceeding this limit are also banned and not used in indexing.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hs4RDXj0-xW9" - }, - "outputs": [], - "source": [ - "phrases, ngram2phrases = get_index(custom_phrases, ngram_mapping_vocab, ban_ngram, min_log_prob=-4.0, max_phrases_per_ngram=600)\n", - "print(\"Size of phrases:\", len(phrases))\n", - "print(\"Size of ngram2phrases:\", len(ngram2phrases))\n", - "\n", - "# Save index to file - later we will use it in other script\n", - "with open(\"index.txt\", \"w\", encoding=\"utf-8\") as out:\n", - " for ngram in ngram2phrases:\n", - " for phrase_id, begin, size, logprob in ngram2phrases[ngram]:\n", - " phrase = phrases[phrase_id]\n", - " out.write(ngram + \"\\t\" + phrase + \"\\t\" + str(begin) + \"\\t\" + str(size) + \"\\t\" + str(logprob) + \"\\n\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RV1sdQ9rvar8" - }, - "source": [ - "## Small detailed example\n", - "\n", - "Let's consider, for example, one custom phrase `thoracic aorta` and an incorrect ASR-hypothesis `the tarasic oorda is a part of the aorta located in the thorax`, containing a misspelled phrase `tarasic_oorda`. \n", - "\n", - "We will see \n", - "1. How this custom phrase is indexed.\n", - "2. How candidate retrieval works, given ASR-hypothesis.\n", - "3. How inference and post-processing work.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kGBTTJXixnrG" - }, - "source": [ - "### N-grams in index" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ryfUlqNMl4vQ" - }, - "source": [ - "Let's look, for example, by what n-grams a custom phrase `thoracic aorta` is indexed. \n", - "Columns: \n", - "1. n-gram\n", - "2. beginning position in the phrase\n", - "3. length\n", - "4. log probability\n", - "\n", - "Note that many n-grams are not from n-gram mappings file. Those are derived by growing previous n-grams with new replacements. In this case log probabilities are summed up. Growing stops, when minimum log prob is exceeded.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x0ZVsXGBo8pt" - }, - "outputs": [], - "source": [ - "for ngram in ngram2phrases:\n", - " for phrase_id, b, length, lprob in ngram2phrases[ngram]:\n", - " if phrases[phrase_id] == \"t h o r a c i c _ a o r t a\":\n", - " print(ngram.ljust(16) + \"\\t\" + str(b).rjust(4) + \"\\t\" + str(length).rjust(4) + \"\\t\" + str(lprob))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "20ov23ze4xeQ" - }, - "source": [ - "### Candidate retrieval\n", - "Candidate retrieval tasks are:\n", - " - Given an input sentence and an index of custom vocabulary find all n-grams from the index matching the sentence. \n", - " - Find which sentence fragments and which custom phrases have most \"hits\" - potential candidates.\n", - " - Find approximate starting position for each candidate phrase. \n", - "\n", - "\n", - "Let's look at the hits, that phrase \"thoracic aorta\" gets by searching all ngrams in the input text. We can see some hits in different part of the sentence, but a moving window can find a fragment with most hits." 
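- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To build intuition for this moving-window step, here is a tiny toy sketch (not the actual implementation - the real logic lives in `get_all_candidates_coverage`, used below): slide a window of the candidate phrase length over the hit vector and keep the position with the most hits."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Toy illustration of the moving-window idea (hypothetical hit vector, not the actual implementation).\n",
- "hits = [0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0]\n",
- "window = 9  # length of the candidate phrase in characters\n",
- "sums = [sum(hits[i:i + window]) for i in range(len(hits) - window + 1)]\n",
- "best_start = max(range(len(sums)), key=lambda i: sums[i])\n",
- "print(\"best window starts at position\", best_start, \"with\", sums[best_start], \"hits\")"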
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "t_rhKQ3Xqa8A" - }, - "outputs": [], - "source": [ - "sent = \"the_tarasic_oorda_is_a_part_of_the_aorta_located_in_the_thorax\"\n", - "phrases2positions, position2ngrams = search_in_index(ngram2phrases, phrases, sent)\n", - "print(\" \".join(list(sent)))\n", - "print(\" \".join(list(map(str, phrases2positions[phrases.index(\"t h o r a c i c _ a o r t a\")].astype(int)))))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "orkRapbjF4aZ" - }, - "source": [ - "`phrases2positions` is a matrix of size (len(phrases), len(ASR_hypothesis)).\n", - "It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere.\n", - "It is used to find phrases with many hits within a contiguous window - potential matching candidates.\n", - "\n", - "`position2ngrams` is a list of sets of ngrams. List index is the starting position in the ASR-hypothesis.\n", - "It is used later to check how well each found candidate is covered by n-grams (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JF7u4_iiHLyI" - }, - "outputs": [], - "source": [ - "candidate2coverage, candidate2position = get_all_candidates_coverage(phrases, phrases2positions)\n", - "print(\"Coverage=\", candidate2coverage[phrases.index(\"t h o r a c i c _ a o r t a\")])\n", - "print(\"Starting position=\", candidate2position[phrases.index(\"t h o r a c i c _ a o r t a\")])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "45mvKg8ZyNbr" - }, - "source": [ - "`candidate2coverage` is a list of size len(phrases) containing coverage (0.0 to 1.0) in best window.\n", - "Coverage is a smoothed percentage of hits in the window of size of the given phrase.\n", - "\n", - "`candidate2position` is a list of size len(phrases) containing starting position of best window.\n", - "\n", - "Starting position is approximate, it's ok. If it is not at the beginning of some word, SpellMapper will try to adjust it later. In this particular example we get 5 as starting position instead of 4, missing the first letter." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Sjyn9I98udL9" - }, - "source": [ - "### Inference\n", - "\n", - "Now let's generate input for SpellMapper inference. \n", - "An input line should consist of 4 tab-separated columns:\n", - " - text of ASR-hypothesis\n", - " - texts of 10 candidates separated by semicolon\n", - " - 1-based ids of non-dummy candidates\n", - " - approximate start/end coordinates of non-dummy candidates (correspond to ids)\n", - "Note that candidate retrieval is done inside the function `get_candidates`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cJnusVfBRhRX" - }, - "outputs": [], - "source": [ - "out = open(\"spellmapper_input.txt\", \"w\", encoding=\"utf-8\")\n", - "letters = list(sent)\n", - "candidates = get_candidates(ngram2phrases, phrases, letters, big_sample)\n", - "# We add two columns with targets and span_info. 
\n", - "# They have same format as during training, but start and end positions are APPROXIMATE, they will be adjusted when constructing BertExample.\n", - "targets = []\n", - "span_info = []\n", - "for idx, c in enumerate(candidates):\n", - " if c[1] == -1:\n", - " continue\n", - " targets.append(str(idx + 1)) # targets are 1-based\n", - " start = c[1]\n", - " end = min(c[1] + c[2], len(letters)) # ensure that end is not outside sentence length (it can happen because c[2] is candidate length used as approximation)\n", - " span_info.append(\"CUSTOM \" + str(start) + \" \" + str(end))\n", - "\n", - "out.write(\" \".join(letters) + \"\\t\" + \";\".join([x[0] for x in candidates]) + \"\\t\" + \" \".join(targets) + \"\\t\" + \";\".join(span_info) + \"\\n\")\n", - "out.close()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Qpei5o89SmaU" - }, - "outputs": [], - "source": [ - "!cat spellmapper_input.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9rAmO15SS6go" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", - " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", - " model.max_sequence_len=512 \\\n", - " inference.from_file=spellmapper_input.txt \\\n", - " inference.out_file=spellmapper_output.txt \\\n", - " inference.batch_size=16 \\\n", - " lang=en\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wd2aq4T1N5cs" - }, - "source": [ - "Each line in SpellMapper output is tab-separated and consists of 4 columns:\n", - "1. ASR-hypothesis (same as in input)\n", - "2. 10 candidates separated with semicolon (same as in input)\n", - "3. fragment predictions, separated with semicolon, each prediction is a tuple (start, end, candidate_id, probability)\n", - "4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ravgEX8cTFty" - }, - "outputs": [], - "source": [ - "!cat spellmapper_output.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "az26364-PHb2" - }, - "source": [ - "We can use some utility functions to apply found replacements and get actual corrected text." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lPtFa_EhK8pb" - }, - "outputs": [], - "source": [ - "spellmapper_results = read_spellmapper_predictions(\"spellmapper_output.txt\")\n", - "text, replacements, _ = spellmapper_results[0]\n", - "corrected_text = apply_replacements_to_text(text, replacements, replace_hyphen_to_space=False)\n", - "print(\"Text before correction:\\n\", text)\n", - "print(\"Text after correction:\\n\", corrected_text)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "efF7O-D91FLX" - }, - "source": [ - "# Bigger customization example\n", - "\n", - "Let's test customization on more data. 
The plan is\n", - " * Get baseline ASR transcriptions by running TTS + ASR on some medical paper abstracts.\n", - " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r_EFPnyDcXZt" - }, - "source": [ - "## Run TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "i9F5SBhmr8rk" - }, - "outputs": [], - "source": [ - "# create a folder for wav files (TTS output)\n", - "!rm -r audio\n", - "!mkdir audio" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JMbkNVt7YBAO" - }, - "outputs": [], - "source": [ - "if torch.cuda.is_available():\n", - " device = \"cuda\"\n", - "else:\n", - " device = \"cpu\"\n", - "\n", - "# Load FastPitch from HuggingFace\n", - "spectrogram_generator = FastPitchModel.from_pretrained(\"nvidia/tts_en_fastpitch\").eval().to(device)\n", - "# Load HifiGan vocoder from HuggingFace\n", - "vocoder = HifiGanModel.from_pretrained(model_name=\"nvidia/tts_hifigan\").eval().to(device)\n", - "\n", - "# Write sentences that we want to feed to TTS\n", - "with open(\"tts_input.txt\", \"w\", encoding=\"utf-8\") as out:\n", - " for sent, _ in sentences[0:100]:\n", - " out.write(sent + \"\\n\")\n", - "\n", - "out_manifest = open(\"manifest.json\", \"w\", encoding=\"utf-8\")\n", - "i = 0\n", - "with open(\"tts_input.txt\", \"r\", encoding=\"utf-8\") as inp:\n", - " for line in inp:\n", - " text = line.strip()\n", - " text_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", text).lower() #replace all punctuation to space and convert to lowercase\n", - " text_clean = \" \".join(text_clean.split())\n", - "\n", - " parsed = spectrogram_generator.parse(text, normalize=True)\n", - "\n", - " spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)\n", - " audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)\n", - "\n", - " # Note that vocoder return a batch of audio. In this example, we just take the first and only sample.\n", - " filename = \"audio/\" + str(i) + \".wav\"\n", - " sf.write(filename, audio.to('cpu').detach().numpy()[0], 16000)\n", - " out_manifest.write(\n", - " \"{\\\"audio_filepath\\\": \\\"\" + filename + \"\\\", \\\"text\\\": \\\"\" + text_clean + \"\\\", \\\"orig_text\\\": \\\"\" + text + \"\\\"}\\n\"\n", - " )\n", - " i += 1\n", - "\n", - " # display some examples\n", - " if i < 10:\n", - " print(f'\"{text}\"\\n')\n", - " ipd.display(ipd.Audio(audio.to('cpu').detach(), rate=22050))\n", - "\n", - "out_manifest.close()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9T3CZcCAmxCz" - }, - "source": [ - "Now we have a folder with generated audios `audio/*.wav` and a nemo manifest with json records like `{\"audio_filepath\": \"audio/0.wav\", \"text\": \"no renal auditory or vestibular toxicity was observed\", \"orig_text\": \"No renal, auditory, or vestibular toxicity was observed.\"}`.", - "\n", - "Note that TTS model may mispronounce some unknown words, for example, abbreviations like `tRNAs`." 
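- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "As an optional spot check, we can look at the first manifest record, compare its normalized `text` with `orig_text`, and listen to the corresponding audio."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Optional spot check: inspect the first record of the manifest written above.\n",
- "with open(\"manifest.json\", \"r\", encoding=\"utf-8\") as f:\n",
- "    first = json.loads(f.readline())\n",
- "print(\"audio_filepath:\", first[\"audio_filepath\"])\n",
- "print(\"text:\", first[\"text\"])\n",
- "print(\"orig_text:\", first[\"orig_text\"])\n",
- "ipd.display(ipd.Audio(first[\"audio_filepath\"]))"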
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pR_T1HnttVjm" - }, - "outputs": [], - "source": [ - "lines = []\n", - "with open(\"manifest.json\", \"r\", encoding=\"utf-8\") as f:\n", - " lines = f.readlines()\n", - "\n", - "for line in lines:\n", - " try:\n", - " data = json.loads(line.strip())\n", - " except:\n", - " print(line)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bt2TMLLvdUHm" - }, - "source": [ - "Free GPU memory to avoid OOM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZwEpAOCaRH7s" - }, - "outputs": [], - "source": [ - "del spectrogram_generator\n", - "del vocoder\n", - "torch.cuda.empty_cache()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HrensakWdLkt" - }, - "source": [ - "## Run baseline ASR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IQNIo2M_mqJc" - }, - "source": [ - "Next we transcribe our .wav files with a general domain [ASR model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_conformer_ctc_large). It will generate an output file `ctc_baseline_transcript.json` where the predicted transcriptions are stored in the field `pred_text` of each record.\n", - "\n", - "Note that this ASR model was not trained or fine-tuned on medical domain, so we expect it to make mistakes on medical terms." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NMN63ux1mJiG" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/transcribe_speech.py \\\n", - " pretrained_name=\"stt_en_conformer_ctc_large\" \\\n", - " dataset_manifest=manifest.json \\\n", - " output_filename=ctc_baseline_transcript_tmp.json \\\n", - " batch_size=2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L3swQ8uqqgnp" - }, - "source": [ - "ATTENTION: SpellMapper relies on words to be separated by _single_ space\n", - "\n", - "There is a bug with multiple space, observed in ASR results produced by Conformer-CTC, probably connected to this issue: https://github.com/NVIDIA/NeMo/issues/4034.\n", - "\n", - "So we need to correct the manifests to ensure that all spaces are single." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z17sxkmXrXpJ" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_baseline_transcript_tmp.json\")\n", - "\n", - "for i in range(len(test_data)):\n", - " # if there are multiple spaces in the string they will be merged to one\n", - " test_data[i][\"pred_text\"] = \" \".join(test_data[i][\"pred_text\"].split())\n", - "\n", - "with open(\"ctc_baseline_transcript.json\", \"w\", encoding=\"utf-8\") as out:\n", - " for d in test_data:\n", - " line = json.dumps(d)\n", - " out.write(line + \"\\n\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PuKtfhbVkVJY" - }, - "outputs": [], - "source": [ - "!head -n 4 ctc_baseline_transcript.json" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aCJw9NEXqRg8" - }, - "source": [ - "### Calculating WER of baseline transcript\n", - "We use the standard script from NeMo to calculate WER and CER of our baseline transcript. Internally it compares the text in `pred_text` (predicted transcript) to `text` (reference transcript). 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZmNEGVWQsGo2" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_baseline_transcript.json \\\n", - " only_score_manifest=True\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AvPwJr0ZqdkN" - }, - "source": [ - "### See fragments that differ\n", - "We use SequenceMatcher to see fragments that differ. (Another option is to use a more powerful analytics tool [Speech Data Explorer](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tools/speech_data_explorer.html))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RAeaVCpMv78y" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_baseline_transcript.json\")\n", - "pred_text = [data['pred_text'] for data in test_data]\n", - "ref_text = [data['text'] for data in test_data]\n", - "audio_filepath = [data['audio_filepath'] for data in test_data]\n", - "\n", - "diff_vocab = Counter()\n", - "\n", - "for i in range(len(test_data)):\n", - " ref_sent = \" \" + ref_text[i] + \" \"\n", - " pred_sent = \" \" + pred_text[i] + \" \"\n", - "\n", - " pred_words = pred_sent.strip().split()\n", - " ref_words = ref_sent.strip().split()\n", - "\n", - " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", - " if tag != \"equal\":\n", - " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", - "\n", - "sum_ = 0\n", - "print(\"PRED vs REF\")\n", - "for k, v in diff_vocab.most_common(1000000):\n", - " sum_ += v\n", - " print(k, v, \"sum=\", sum_)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dUSOF7iD1w_9" - }, - "source": [ - "## Run SpellMapper" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "x39BQhYB6_Fr" - }, - "source": [ - "Now we run retrieval on our input manifest and prepare input for SpellMapper inference. Note that we use index of custom vocabulary (file `index.txt` that we saved earlier)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y8x-yT5WqfFz" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py \\\n", - " --manifest ctc_baseline_transcript.json \\\n", - " --custom_vocab_index index.txt \\\n", - " --big_sample spellmapper_asr_customization_en/big_sample.txt \\\n", - " --short2full_name short2full.txt \\\n", - " --output_name spellmapper_input.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ueq_JAPWGs_Y" - }, - "source": [ - "Run the inference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zgkqiiZtJjcB" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", - " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", - " model.max_sequence_len=512 \\\n", - " inference.from_file=spellmapper_input.txt \\\n", - " inference.out_file=spellmapper_output.txt \\\n", - " inference.batch_size=16 \\\n", - " lang=en\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RPQWJX8dFLfX" - }, - "source": [ - "Now we postprocess SpellMapper output and create output corrected manifest." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3eFU515yKvXP" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", - " --input_manifest ctc_baseline_transcript.json \\\n", - " --short2full_name short2full.txt \\\n", - " --output_manifest ctc_corrected_transcript.json \\\n", - " --spellmapper_result spellmapper_output.txt \\\n", - " --replace_hyphen_to_space \\\n", - " --field_name pred_text \\\n", - " --ngram_mappings \"\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hRoIhhGh17tp" - }, - "source": [ - "### Calculating WER of corrected transcript." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qIT957bGo9AY" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_corrected_transcript.json \\\n", - " only_score_manifest=True\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NYXIPusupqOQ" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_corrected_transcript.json\")\n", - "pred_text = [data['pred_text'] for data in test_data]\n", - "ref_text = [data['pred_text_before_correction'] for data in test_data]\n", - "\n", - "diff_vocab = Counter()\n", - "\n", - "for i in range(len(test_data)):\n", - " ref_sent = \" \" + ref_text[i] + \" \"\n", - " pred_sent = \" \" + pred_text[i] + \" \"\n", - "\n", - " pred_words = pred_sent.strip().split()\n", - " ref_words = ref_sent.strip().split()\n", - "\n", - " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", - " if tag != \"equal\":\n", - " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", - "\n", - "sum_ = 0\n", - "print(\"Corrected vs baseline\")\n", - "for k, v in diff_vocab.most_common(1000000):\n", - " sum_ += v\n", - " print(k, v, \"sum=\", sum_)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DJtXlqXbTD6M" - }, - "source": [ - "### Filtering by Dynamic Programming(DP) score\n", - "\n", - "What else can be done?\n", - "Given a fragment and its potential replacement, we can apply **dynamic programming** to find the most probable \"translation\" path between them. We will use the same n-gram mapping vocabulary, because its frequencies give us \"translation probability\" of each n-gram pair. The final path score can be calculated as maximum sum of log probabilities of matching n-grams along this path.\n", - "Let's look at an example. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "05Qf9wgHU_UR" - }, - "outputs": [], - "source": [ - "joint_vocab, orig_vocab, misspelled_vocab, max_len = load_ngram_mappings_for_dp(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\")\n", - "\n", - "fragment = \"and hydrod\"\n", - "replacement = \"anhydride\"\n", - "fragment_spaced = \" \".join(list(fragment.replace(\" \", \"_\")))\n", - "replacement_spaced = \" \".join(list(replacement.replace(\" \", \"_\")))\n", - "path = get_alignment_by_dp(\n", - " replacement_spaced,\n", - " fragment_spaced,\n", - " dp_data=(joint_vocab, orig_vocab, misspelled_vocab, max_len)\n", - ")\n", - "print(\"Dynamic Programming path:\")\n", - "for fragment_ngram, replacement_ngram, score, sum_score, joint_freq, orig_freq, misspelled_freq in path:\n", - " print(\n", - " \"\\t\",\n", - " \"frag=\",\n", - " fragment_ngram,\n", - " \"; repl=\",\n", - " replacement_ngram,\n", - " \"; score=\",\n", - " score,\n", - " \"; sum_score=\",\n", - " sum_score,\n", - " \"; joint_freq=\",\n", - " joint_freq,\n", - " \"; orig_freq=\",\n", - " orig_freq,\n", - " \"; misspelled_freq=\",\n", - " misspelled_freq,\n", - " )\n", - "\n", - "print(\"Final path score is in path[-1][3]: \", path[-1][3])\n", - "print(\"Dynamic programming(DP) score per symbol is final score divided by len(fragment): \", path[-1][3] / (len(fragment)))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hgfKPKckaLnc" - }, - "source": [ - "The idea is that we can skip replacements whose average DP score per symbol is below some predefined minimum, say -1.5.\n", - "Note that dynamic programming works slow because of quadratic complexity, but it allows to get rid of some false positives. Let's apply it on the same test set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UhSXh7ht_JRn" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", - " --input_manifest ctc_baseline_transcript.json \\\n", - " --short2full_name short2full.txt \\\n", - " --output_manifest ctc_corrected_transcript_dp.json \\\n", - " --spellmapper_result spellmapper_output.txt \\\n", - " --replace_hyphen_to_space \\\n", - " --field_name pred_text \\\n", - " --use_dp \\\n", - " --ngram_mappings spellmapper_asr_customization_en/replacement_vocab_filt.txt \\\n", - " --min_dp_score_per_symbol -1.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "u8R5YHB3vPC8" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_corrected_transcript_dp.json \\\n", - " only_score_manifest=True" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "upvTbkFAeYtR" - }, - "source": [ - "# Final notes\n", - "1. Bash-script with example of inference pipeline [run_infer.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_infer.sh)\n", - "\n", - "2. Check our paper: [SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings](https://arxiv.org/abs/2306.02317)\n", - "\n", - "3. 
To reproduce evaluation experiments from this paper see these scripts:\n", - " - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - " - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - " - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - "\n", - "4. To reproduce creation of training data see [README.md](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/README.md)\n", - "\n", - "5. To run training see [run_training.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_training.sh)\n", - "\n", - "6. Promising future research directions would be:\n", - " - add a simple trainable classifier on top of SpellMapper predictions instead of using multiple thresholds\n", - " - retrain with adding more various false positives to the training data" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From ebba8b14263ca513c4453fcde0472785c19f46c1 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Mon, 10 Jun 2024 15:36:17 -0700 Subject: [PATCH 02/25] Add Dev Container Bug Report (#9430) * Add dev_container_bug_report.md Signed-off-by: Pablo Garay * Date field refactor --------- Signed-off-by: Pablo Garay --- .../dev_container_bug_report.md | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/dev_container_bug_report.md diff --git a/.github/ISSUE_TEMPLATE/dev_container_bug_report.md b/.github/ISSUE_TEMPLATE/dev_container_bug_report.md new file mode 100644 index 000000000000..fe81ec6252d8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/dev_container_bug_report.md @@ -0,0 +1,35 @@ +--- +container pulled on date: mm/dd/yyyy +name: Dev container - Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + +**Describe the bug** + +A clear and concise description of what the bug is. + +**Steps/Code to reproduce bug** + +Please list *minimal* steps or code snippet for us to be able to reproduce the bug. + +A helpful guide on on how to craft a minimal bug report http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports. + + +**Expected behavior** + +A clear and concise description of what you expected to happen. + +**Environment overview (please complete the following information)** + + - Environment location: Docker + - Method of install: Please specify exact commands you used to install. + - If method of install is [Docker], provide `docker pull` & `docker run` commands used + +**Additional context** + +Add any other context about the problem here. 
+Example: GPU model From 97aa7322a5de430a908f4bcafac371521c3116c0 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 11 Jun 2024 16:27:08 +0200 Subject: [PATCH 03/25] Enable specyfing alpha for SQ (#9423) Signed-off-by: Jan Lasek --- examples/nlp/language_modeling/conf/megatron_quantization.yaml | 1 + nemo/export/quantize/quantizer.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/examples/nlp/language_modeling/conf/megatron_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_quantization.yaml index 88d10ae0a66c..52454f5c8906 100644 --- a/examples/nlp/language_modeling/conf/megatron_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_quantization.yaml @@ -26,6 +26,7 @@ quantization: calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset num_calib_size: 512 # number of samples used for calibration awq_block_size: 128 # block size for scaling factors in AWQ algorithm + alpha: 1.0 # alpha parameter in SmoothQuant algorithm export: decoder_type: llama # gptnext, gpt2, llama diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 4748f4957a52..e25d529ec62c 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -116,6 +116,9 @@ def __init__( "axis": None, "enable": enable_quant_kv_cache, } + if quantization_config.algorithm == "int8_sq": + logging.info(f"Using int8_sq alpha = {quantization_config.alpha}") + quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": quantization_config.alpha} self.quant_cfg = quant_cfg else: From 91ab412e484e29cf9ebe0286c428281b8e599523 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:27:07 +0300 Subject: [PATCH 04/25] add support for new mcore ds features (#9388) * add validation_drop_last and add_extra_token params support for mcore ds Signed-off-by: dimapihtar * pad samples with dummy tokens only Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * use no_seqlen_plus_one_input_tokens as mcore's add_extra_token Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * set train_valid_test_num_samples[1] to None Signed-off-by: dimapihtar * add test case when validation_drop_last is False Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * set validation_drop_last as True by default Signed-off-by: dimapihtar * Update nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> * Update nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> --- .github/workflows/cicd-main.yml | 2 ++ .../nlp/data/language_modeling/megatron/data_samplers.py | 5 ++--- .../nlp/models/language_modeling/megatron_gpt_model.py | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 01a8cfc4b0df..6cf60271e0d7 100644 --- a/.github/workflows/cicd-main.yml +++ 
b/.github/workflows/cicd-main.yml @@ -2398,6 +2398,7 @@ jobs: model.activations_checkpoint_method=block \ model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings @@ -2432,6 +2433,7 @@ jobs: model.activations_checkpoint_method=block \ model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index 6818f99d0e4f..4a8b989a7b6d 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -91,8 +91,7 @@ def __len__(self): return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @abc.abstractmethod - def __iter__(self): - ... + def __iter__(self): ... class MegatronPretrainingSampler(BaseMegatronSampler): @@ -107,7 +106,7 @@ def __iter__(self): indices = range(self.consumed_samples, self.total_samples) if (not self.drop_last) and self.pad_samples_to_global_batch_size: pad_samples_num = -len(indices) % self.global_batch_size - pad_indices = range(-1, -pad_samples_num - 1, -1) + pad_indices = [None] * pad_samples_num indices = chain(indices, pad_indices) for idx in indices: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 718991dc203d..8cb8d95150c9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1472,8 +1472,7 @@ def build_train_valid_test_datasets(self): # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below). 
        # Setting N = 1 we force E to be 1 as well
        if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float):
-            train_valid_test_num_samples[1] = 1
-
+            train_valid_test_num_samples[1] = None
         # Add extra FIM tokens to tokenizer
         if self.cfg.data.get('add_fim', False) and self.cfg.tokenizer.library == 'megatron':
             fim_tokens = self.cfg.data.fim.extra_tokens
@@ -1498,6 +1497,7 @@ def build_train_valid_test_datasets(self):
             is_dataset_built_on_rank = lambda: True

         mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False
+        add_extra_token = not self.cfg.data.get("no_seqlen_plus_one_input_tokens", False)
         kwargs = {
             "random_seed": self.cfg.seed,
             "sequence_length": self.cfg.data.seq_length,
@@ -1508,6 +1508,8 @@ def build_train_valid_test_datasets(self):
             "eod_mask_loss": self.eod_mask_loss,
             "create_attention_mask": not self.get_attention_mask_from_fusion,
             "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True),
+            "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", True),
+            "add_extra_token_to_sequence": add_extra_token,
         }

         data_prefix = self.cfg.data.data_prefix

From df5f8cb0a16caadf319f8ebe96c2199fcb8594b2 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
Date: Tue, 11 Jun 2024 10:54:14 -0700
Subject: [PATCH 05/25] Akoumparouli/profiling docs (#9420)

* profiling docs

Signed-off-by: Alexandros Koumparoulis

* fix docstring

Signed-off-by: Alexandros Koumparoulis

* Apply isort and black reformatting

Signed-off-by: akoumpa

---------

Signed-off-by: Alexandros Koumparoulis
Signed-off-by: akoumpa
Co-authored-by: akoumpa
---
 docs/source/core/core.rst    |  32 +++++++
 nemo/core/classes/modelPT.py | 181 ++++++++++++++++++-----------------
 2 files changed, 127 insertions(+), 86 deletions(-)

diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst
index 1c9325cf0a96..3c1a496993bd 100644
--- a/docs/source/core/core.rst
+++ b/docs/source/core/core.rst
@@ -741,3 +741,35 @@ To register a child model, use the ``register_nemo_submodule`` method of the par
         else:
             self.child_model = None
+
+
+Profiling
+---------
+
+NeMo offers users two options for profiling: Nsys & CUDA memory profiling. These two options allow users
+to debug performance issues as well as memory issues such as memory leaks.
+
+To enable Nsys profiling, add the following options to the model config:
+nsys_profile: False
+  start_step: 10  # Global batch to start profiling
+  end_step: 10 # Global batch to end profiling
+  ranks: [0] # Global rank IDs to profile
+  gen_shape: False # Generate model and kernel details including input shapes
+
+Finally, wrap the model training script with:
+
+nsys profile -s none -o <profile filepath> -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/...
+See more options at `nsight user guide <https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-profiling>`_.
+
+
+
+To enable CUDA memory profiling, add the following options to the model config:
+
+memory_profile:
+  enabled: True
+  start_step: 10 # Global batch to start profiling
+  end_step: 10 # Global batch to end profiling
+  rank: 0 # Global rank ID to profile
+  output_path: None # Path to store the profile output file
+
+And invoke your NeMo script without any changes in the invocation command.
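For quick reference, the two option blocks above can be combined under the ``model`` section of a training config as in the sketch below. This is an illustrative example rather than text from the patch: the step, rank, and path values are placeholders, and nesting the Nsys options under an ``enabled`` flag (mirroring ``memory_profile``) is an assumption based on how ``_setup_profiling`` reads ``nsys_profile.enabled`` in the modelPT.py changes that follow::

    model:
      nsys_profile:
        enabled: True       # turn Nsys capture on for the steps below
        start_step: 10      # global batch to start profiling
        end_step: 10        # global batch to end profiling
        ranks: [0]          # global rank IDs to profile
        gen_shape: False    # also record kernel input shapes
      memory_profile:
        enabled: True       # record CUDA memory history
        start_step: 10
        end_step: 10
        rank: 0             # single global rank ID to profile
        output_path: /results/mem_profile  # placeholder path for the snapshot file

With a config like this, the Nsys run is wrapped with ``nsys profile -s none -o <profile filepath> -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/...``, while the memory profiler needs no change to the launch command.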
diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 0a9054c23da8..f5d61a8edb15 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -220,37 +220,40 @@ def on_fit_start(self) -> None: return super().on_fit_start() def register_artifact( - self, config_path: str, src: str, verify_src_exists: bool = True, + self, + config_path: str, + src: str, + verify_src_exists: bool = True, ): - """ Register model artifacts with this function. These artifacts (files) will be included inside .nemo file - when model.save_to("mymodel.nemo") is called. + """Register model artifacts with this function. These artifacts (files) will be included inside .nemo file + when model.save_to("mymodel.nemo") is called. - How it works: + How it works: - 1. It always returns existing absolute path which can be used during Model constructor call - EXCEPTION: src is None or "" in which case nothing will be done and src will be returned - 2. It will add (config_path, model_utils.ArtifactItem()) pair to self.artifacts + 1. It always returns existing absolute path which can be used during Model constructor call + EXCEPTION: src is None or "" in which case nothing will be done and src will be returned + 2. It will add (config_path, model_utils.ArtifactItem()) pair to self.artifacts - .. code-block:: + .. code-block:: - If "src" is local existing path: - then it will be returned in absolute path form. - elif "src" starts with "nemo_file:unique_artifact_name": - .nemo will be untarred to a temporary folder location and an actual existing path will be returned - else: - an error will be raised. + If "src" is local existing path: + then it will be returned in absolute path form. + elif "src" starts with "nemo_file:unique_artifact_name": + .nemo will be untarred to a temporary folder location and an actual existing path will be returned + else: + an error will be raised. - WARNING: use .register_artifact calls in your models' constructors. - The returned path is not guaranteed to exist after you have exited your model's constructor. + WARNING: use .register_artifact calls in your models' constructors. + The returned path is not guaranteed to exist after you have exited your model's constructor. - Args: - config_path (str): Artifact key. Usually corresponds to the model config. - src (str): Path to artifact. - verify_src_exists (bool): If set to False, then the artifact is optional and register_artifact will return None even if - src is not found. Defaults to True. + Args: + config_path (str): Artifact key. Usually corresponds to the model config. + src (str): Path to artifact. + verify_src_exists (bool): If set to False, then the artifact is optional and register_artifact will return None even if + src is not found. Defaults to True. 
- Returns: - str: If src is not None or empty it always returns absolute path which is guaranteed to exist during model instance life + Returns: + str: If src is not None or empty it always returns absolute path which is guaranteed to exist during model instance life """ if src is None or src == "": @@ -610,7 +613,9 @@ def setup_megatron_optimization(self, optim_config: Union[Dict[str, Any], DictCo return megatron_optim_config def setup_optimization( - self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None, + self, + optim_config: Optional[Union[DictConfig, Dict]] = None, + optim_kwargs: Optional[Dict[str, Any]] = None, ): """Prepares an optimizer from a string name and its optional config parameters. @@ -760,7 +765,10 @@ def setup_optimization( if optimizer_name == 'mcore_distributed_optim': # setup megatron_optim_config and get Mcore based optimizer with the wrapper megatron_optim_config = self.setup_megatron_optimization(optimizer_args) - _megatron_optimizer = get_megatron_optimizer(megatron_optim_config, self.model,) + _megatron_optimizer = get_megatron_optimizer( + megatron_optim_config, + self.model, + ) optimizer = McoreDistributedOptimizer(_megatron_optimizer) else: @@ -781,30 +789,30 @@ def setup_optimization( def setup_optimizer_param_groups(self): """ - Used to create param groups for the optimizer. - As an example, this can be used to specify per-layer learning rates: - - optim.SGD([ - {'params': model.base.parameters()}, - {'params': model.classifier.parameters(), 'lr': 1e-3} - ], lr=1e-2, momentum=0.9) - - See https://pytorch.org/docs/stable/optim.html for more information. - By default, ModelPT will use self.parameters(). - Override this method to add custom param groups. - In the config file, add 'optim_param_groups' to support different LRs - for different components (unspecified params will use the default LR): - - model: - optim_param_groups: - encoder: - lr: 1e-4 - momentum: 0.8 - decoder: - lr: 1e-3 - optim: - lr: 3e-3 - momentum: 0.9 + Used to create param groups for the optimizer. + As an example, this can be used to specify per-layer learning rates: + + optim.SGD([ + {'params': model.base.parameters()}, + {'params': model.classifier.parameters(), 'lr': 1e-3} + ], lr=1e-2, momentum=0.9) + + See https://pytorch.org/docs/stable/optim.html for more information. + By default, ModelPT will use self.parameters(). + Override this method to add custom param groups. + In the config file, add 'optim_param_groups' to support different LRs + for different components (unspecified params will use the default LR): + + model: + optim_param_groups: + encoder: + lr: 1e-4 + momentum: 0.8 + decoder: + lr: 1e-3 + optim: + lr: 3e-3 + momentum: 0.9 """ if not hasattr(self, "parameters"): self._optimizer_param_groups = None @@ -1710,26 +1718,27 @@ def update_save_restore_connector(cls, save_restore_connector): setattr(cls, '_save_restore_connector', save_restore_connector) def _setup_profiling(self): - """ Enables nsys profiling - To use, add the following optoins to the model config: - ## Nsys profiling options - nsys_profile: False - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - ranks: [0] # Global rank IDs to profile - gen_shape: False # Generate model and kernel details including input shapes - And then wrap the model training script with: - nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... 
- See more options at: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-profiling - - Enables CUDA memory profiling - To use, add the following optoins to the model config: - ## CUDA memory profiling options - memory_profile: False - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - rank: 0 # Global rank ID to profile - output_path: None # Path to store the profile output file + """Enables nsys profiling + To use, add the following optoins to the model config: + ## Nsys profiling options + nsys_profile: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + And then wrap the model training script with: + nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... + See more options at: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-profiling + + Enables CUDA memory profiling + To use, add the following options to the model config: + ## CUDA memory profiling options + memory_profile: + enabled: True + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + rank: 0 # Global rank ID to profile + output_path: None # Path to store the profile output file """ if self.cfg.get('nsys_profile', None) is not None: if self.cfg.nsys_profile.get('enabled', False): @@ -1791,9 +1800,9 @@ def _setup_profiling(self): ) def on_train_start(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-start - We use it here to copy the relevant config for dynamic freezing. + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-start + We use it here to copy the relevant config for dynamic freezing. """ # dynamic freezing @@ -1810,9 +1819,9 @@ def on_train_start(self): setattr(self, '_freeze_cfg', None) def on_train_batch_start(self, batch: Any, batch_idx: int, unused: int = 0) -> Optional[int]: - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-start - We use it here to enable nsys profiling and dynamic freezing. + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-start + We use it here to enable nsys profiling and dynamic freezing. """ # nsys profiling @@ -1856,9 +1865,9 @@ def on_train_batch_start(self, batch: Any, batch_idx: int, unused: int = 0) -> O self._freeze_cfg['is_frozen'][ml] = False def on_train_batch_end(self, outputs, batch: Any, batch_idx: int, unused: int = 0) -> None: - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-end - We use it here to enable nsys profiling. + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-end + We use it here to enable nsys profiling. """ if self.device.type == 'cuda': @@ -1893,30 +1902,30 @@ def _cleanup_on_execution_end(self): self._test_step_outputs = None def on_train_end(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end - We use it here to cleanup the dynamic freezing config. 
+ """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end + We use it here to cleanup the dynamic freezing config. """ self._cleanup_on_execution_end() def on_test_end(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end """ self._cleanup_on_execution_end() def on_predict_end(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end """ self._cleanup_on_execution_end() # TODO: Remove in PTL 1.7.2 def cuda(self, device=None): - """ PTL is overriding this method and changing the pytorch behavior of a module. + """PTL is overriding this method and changing the pytorch behavior of a module. The PTL LightingModule override will move the module to device 0 if device is None. See the PTL method here: https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/core/mixins/device_dtype_mixin.py#L113 From c51cdbb5d2ab8e99cb48d621cc33706931b13a7f Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Tue, 11 Jun 2024 15:55:01 -0400 Subject: [PATCH 06/25] LoRA for MoE Layer (#9396) * initial moe lora impl Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * fix dangling adapter Signed-off-by: Chen Cui * update to newest mcore code Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- .../common/megatron/adapters/mcore_mixins.py | 73 ++++++++++++--- .../megatron/adapters/parallel_adapters.py | 88 +++++++++++++++++-- nemo/collections/nlp/parts/peft_config.py | 40 +++++++-- 3 files changed, 173 insertions(+), 28 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index a85c155cc0a8..bcfe07f702a0 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -14,19 +14,16 @@ import torch import torch.nn.functional as F -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.tensor_parallel import ColumnParallelLinear from megatron.core.transformer.attention import SelfAttention -from megatron.core.transformer.custom_layers.transformer_engine import ( - SplitAlongDim, - TEColumnParallelLinear, - TELayerNormColumnParallelLinear, -) +from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim from megatron.core.transformer.mlp import MLP +from megatron.core.transformer.moe.experts import SequentialMLP from megatron.core.transformer.transformer_layer import TransformerLayer from megatron.core.utils import make_viewless_tensor @@ -37,6 +34,8 @@ LoraDenseAttentionAdapterConfig, 
LoraHto4HAdapterConfig, LoraKQVAdapterConfig, + LoraMoe4HtoHAdapterConfig, + LoraMoeHto4HAdapterConfig, LoraUnfusedHto4HAdapterConfig, LoraUnfusedKQVAdapterConfig, MLPInfusedAdapterConfig, @@ -281,13 +280,15 @@ def forward( class MCoreMLPMixin(MLP, MCoreAdapterModuleMixin): def mcore_register_adapters(self): """ - Setup NeMo IA3 adapter to this MCore layer. + Setup NeMo IA3 and LoRA adapter to this MCore layer. """ self.set_accepted_adapter_types( [ LoraUnfusedHto4HAdapterConfig._target_, LoraHto4HAdapterConfig._target_, Lora4HtoHAdapterConfig._target_, + LoraMoeHto4HAdapterConfig._target_, + LoraMoe4HtoHAdapterConfig._target_, MLPInfusedAdapterConfig._target_, ] ) # only self attn (packed qkv) for now @@ -302,9 +303,12 @@ def mcore_register_adapters(self): # overlap is used. self.linear_fc1.return_layernorm_output_gathered = True - def forward(self, hidden_states): + def forward(self, hidden_states, expert_idx=None): # [s, b, 4 * h/p] - if self.linear_fc1.te_return_bias: + if isinstance(self.linear_fc1, ColumnParallelLinear): + layernorm_output = hidden_states + intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) + elif self.linear_fc1.te_return_bias: intermediate_parallel, bias_parallel, layernorm_output = self.linear_fc1(hidden_states) else: # bias_parallel is None @@ -315,15 +319,19 @@ def forward(self, hidden_states): lora_adapter = None lora_fc1_adapter = self.get_adapter_module(AdapterName.LORA_Hto4H_ADAPTER) lora_unfused_fc1_adapter = self.get_adapter_module(AdapterName.LORA_UNFUSED_Hto4H_ADAPTER) + lora_moe_fc1_adapter = self.get_adapter_module(AdapterName.LORA_MOE_Hto4H_ADAPTER) if lora_fc1_adapter and self.adapter_cfg[AdapterName.LORA_Hto4H_ADAPTER]['enabled']: lora_adapter = lora_fc1_adapter if lora_unfused_fc1_adapter and self.adapter_cfg[AdapterName.LORA_UNFUSED_Hto4H_ADAPTER]['enabled']: assert lora_adapter is None, "Expected only one of LORA_Hto4H_ADAPTER or LORA_UNFUSED_Hto4H_ADAPTER" lora_adapter = lora_unfused_fc1_adapter + lora_output = 0 if lora_adapter: lora_output = lora_adapter(layernorm_output) - intermediate_parallel = intermediate_parallel + lora_output + elif lora_moe_fc1_adapter and self.adapter_cfg[AdapterName.LORA_MOE_Hto4H_ADAPTER]['enabled']: + lora_output = lora_moe_fc1_adapter(layernorm_output, expert_idx) + intermediate_parallel = intermediate_parallel + lora_output if self.config.bias_activation_fusion: if self.activation_func == F.gelu: @@ -363,14 +371,51 @@ def glu(x): # LoRA logic if self.is_adapter_available(): - lora_linear_fc2_adapter = self.get_adapter_module(AdapterName.LORA_4HtoH_ADAPTER) - if lora_linear_fc2_adapter and self.adapter_cfg[AdapterName.LORA_4HtoH_ADAPTER]['enabled']: - lora_output = lora_linear_fc2_adapter(intermediate_parallel) - output = output + lora_output + lora_fc2_adapter = self.get_adapter_module(AdapterName.LORA_4HtoH_ADAPTER) + lora_moe_fc2_adapter = self.get_adapter_module(AdapterName.LORA_MOE_4HtoH_ADAPTER) + + lora_output = 0 + if lora_fc2_adapter and self.adapter_cfg[AdapterName.LORA_4HtoH_ADAPTER]['enabled']: + lora_output = lora_fc2_adapter(intermediate_parallel) + elif lora_moe_fc2_adapter and self.adapter_cfg[AdapterName.LORA_MOE_4HtoH_ADAPTER]['enabled']: + lora_output = lora_moe_fc2_adapter(intermediate_parallel, expert_idx) + + output = output + lora_output return output, output_bias +class MCoreSequentialMLPMixin(SequentialMLP, MCoreAdapterModuleMixin): + def mcore_register_adapters(self): + """ + We don't want the SequentialMLP layer to take any adapters. 
We only want to override the forward() behavior + """ + pass + + def forward(self, permuted_local_hidden_states, tokens_per_expert): + output_local = torch.zeros_like(permuted_local_hidden_states) + output_bias_local = None + if self.add_bias: + output_bias_local = torch.zeros_like(permuted_local_hidden_states) + + cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) + # Insert zero at the begining for offset index's convenience + zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) + cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) + for expert_num, expert in enumerate(self.local_experts): + start = cumsum_num_tokens[expert_num] + end = cumsum_num_tokens[expert_num + 1] + hidden = permuted_local_hidden_states[start:end] + output, output_bias = expert(hidden, expert_num) # expert: MLP + + output_local[start:end] = output + if self.add_bias: + output_bias = output_bias.expand_as(output) + output_bias_local[start:end, :] = output_bias + + return output_local, output_bias_local + + class MCoreGPTEmbeddingMixin(LanguageModelEmbedding, MCoreAdapterModuleMixin): def mcore_register_adapters(self): """ diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 61903e6b3673..21dace008877 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -83,6 +83,8 @@ class AdapterName(str, enum.Enum): LORA_Hto4H_ADAPTER = "lora_hto4h_adapter" LORA_UNFUSED_Hto4H_ADAPTER = "lora_unfused_hto4h_adapter" LORA_4HtoH_ADAPTER = "lora_4htoh_adapter" + LORA_MOE_Hto4H_ADAPTER = "lora_moe_hto4h_adapter" + LORA_MOE_4HtoH_ADAPTER = "lora_moe_4htoh_adapter" MULTIMODAL_PROJECTOR_ADAPTER = "mm_projector_adapter" PARALLEL_LINEAR_ADAPTER = "parallel_linear_adapter" @@ -611,6 +613,80 @@ class LoraUnfusedKQVAdapterConfig(AdapterConfig): _target_: str = "{0}.{1}".format(LoraUnfusedKQVAdapter.__module__, LoraUnfusedKQVAdapter.__name__) +class LoraMoeAdapter(nn.Module, AdapterModuleUtil): + def __init__( + self, + num_moe_experts: int, + in_features: int, + out_features: int, + dim: int, + activation: str = 'identity', + norm_position: Optional[str] = None, + norm_type: Optional[str] = None, + column_init_method: str = 'xavier', + row_init_method: str = 'zero', + gather_output: bool = False, + input_is_parallel: bool = False, + dropout: float = 0.0, + model_parallel_config: Optional[ModelParallelConfig] = None, + alpha: float | None = None, + dropout_position: str = 'post', + a2a_experimental: bool = False, + **kwargs, + ): + super().__init__() + + self.num_moe_experts = num_moe_experts + adapter_args = { + "in_features": in_features, + "out_features": out_features, + "dim": dim, + "activation": activation, + "norm_position": norm_position, + "norm_type": norm_type, + "column_init_method": column_init_method, + "row_init_method": row_init_method, + "gather_output": gather_output, + "input_is_parallel": input_is_parallel, + "dropout": dropout, + "model_parallel_config": model_parallel_config, + "alpha": alpha, + "dropout_position": dropout_position, + "a2a_experimental": a2a_experimental, + } + self.expert_adapters = nn.ModuleList() + for i in range(num_moe_experts): + self.expert_adapters.append(ParallelLinearAdapter(**adapter_args)) + + def forward(self, x, expert_idx): + return self.expert_adapters[expert_idx](x) + + +@dataclass +class 
LoraMoeHto4HAdapterConfig(AdapterConfig): + num_moe_experts: int + in_features: int + out_features: int + dim: int + activation: str = 'identity' + norm_position: Optional[str] = None + norm_type: Optional[str] = None + column_init_method: str = 'xavier' + row_init_method: str = 'zero' + gather_output: bool = False + input_is_parallel: bool = False + dropout: float = 0.0 + dropout_position: str = 'post' + alpha: float | None = None + a2a_experimental: bool = False + _target_: str = "{0}.{1}".format(LoraMoeAdapter.__module__, LoraMoeAdapter.__name__) + + +@dataclass +class LoraMoe4HtoHAdapterConfig(LoraMoeHto4HAdapterConfig): + input_is_parallel: bool = True + + class PromptEncoderAdapter(nn.Module, AdapterModuleUtil): """ The Tensor Parallel MLP prompt encoder network that is used to generate the virtual @@ -690,20 +766,14 @@ def set_inference_table(self, prompt_representation: torch.Tensor): self.is_inference_ready = True return True - def clear_inference_table( - self, - ): + def clear_inference_table(self): self.inference_table.fill_(0.0) self.is_inference_ready = False - def get_inference_table( - self, - ): + def get_inference_table(self): return self.inference_table.data - def inner_forward( - self, - ): + def inner_forward(self): input_embeds = self.embedding(self.indices).unsqueeze(0) intermediate_parallel, bias_parallel = self.first(input_embeds) intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 4d558ce00114..50c97e349885 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -23,6 +23,7 @@ MCoreGPTEmbeddingMixin, MCoreMLPMixin, MCoreSelfAttentionMixin, + MCoreSequentialMLPMixin, MCoreTransformerLayerMixin, ) except (ImportError, ModuleNotFoundError): @@ -36,6 +37,8 @@ LoraHto4HAdapterConfig, LoraKQVAdapterConfig, LoraKQVAdapterWeightTyingConfig, + LoraMoe4HtoHAdapterConfig, + LoraMoeHto4HAdapterConfig, LoraUnfusedHto4HAdapterConfig, LoraUnfusedKQVAdapterConfig, MLPInfusedAdapterConfig, @@ -176,7 +179,10 @@ def __init__(self, cfg): elif module == PEFT_MODULE_MAP["hto4h_module"]: hto4h_projection_size = cfg.ffn_hidden_size * 2 if fast_glu_activation else cfg.ffn_hidden_size - if lora_cfg.get("variant", "nemo") == "canonical": + if cfg.get('num_moe_experts', None): + _adapter_name = AdapterName.LORA_MOE_Hto4H_ADAPTER + _adapter_cfg_cls = LoraMoeHto4HAdapterConfig + elif lora_cfg.get("variant", "nemo") == "canonical": _adapter_name = AdapterName.LORA_UNFUSED_Hto4H_ADAPTER _adapter_cfg_cls = LoraUnfusedHto4HAdapterConfig else: @@ -187,13 +193,35 @@ def __init__(self, cfg): cfg, lora_cfg, cfg.hidden_size, hto4h_projection_size, _adapter_cfg_cls ) name_key_to_cfg[_adapter_name] = adapter_cfg - name_key_to_mcore_mixins[_adapter_name] = [("mlp", MCoreMLPMixin)] + if _adapter_name == AdapterName.LORA_MOE_Hto4H_ADAPTER: + name_key_to_mcore_mixins[_adapter_name] = [("mlp.experts", MCoreSequentialMLPMixin)] + for i in range(int(cfg.num_moe_experts)): + name_key_to_mcore_mixins[_adapter_name].append( + (f"mlp.experts.local_experts.{i}", MCoreMLPMixin) + ) + else: + name_key_to_mcore_mixins[_adapter_name] = [("mlp", MCoreMLPMixin)] + elif module == PEFT_MODULE_MAP["4htoh_module"]: + if cfg.get('num_moe_experts', None): + _adapter_name = AdapterName.LORA_MOE_4HtoH_ADAPTER + _adapter_cfg_cls = LoraMoe4HtoHAdapterConfig + else: + _adapter_name = AdapterName.LORA_4HtoH_ADAPTER + _adapter_cfg_cls = 
Lora4HtoHAdapterConfig + adapter_cfg = self._create_lora_config( - cfg, lora_cfg, cfg.ffn_hidden_size, cfg.hidden_size, Lora4HtoHAdapterConfig + cfg, lora_cfg, cfg.ffn_hidden_size, cfg.hidden_size, _adapter_cfg_cls ) - name_key_to_cfg[AdapterName.LORA_4HtoH_ADAPTER] = adapter_cfg - name_key_to_mcore_mixins[AdapterName.LORA_4HtoH_ADAPTER] = [("mlp", MCoreMLPMixin)] + name_key_to_cfg[_adapter_name] = adapter_cfg + if _adapter_name == AdapterName.LORA_MOE_4HtoH_ADAPTER: + name_key_to_mcore_mixins[_adapter_name] = [("mlp.experts", MCoreSequentialMLPMixin)] + for i in range(int(cfg.num_moe_experts)): + name_key_to_mcore_mixins[_adapter_name].append( + (f"mlp.experts.local_experts.{i}", MCoreMLPMixin) + ) + else: + name_key_to_mcore_mixins[_adapter_name] = [("mlp", MCoreMLPMixin)] else: logging.error( f"Unrecognized target_module string: {module}.\n" @@ -228,6 +256,8 @@ def _create_lora_config( assert kv_channels is not None, "kv_channels must be provided for canonical Lora" config_args.update({"num_query_groups": num_query_groups, "kv_channels": kv_channels}) config_args.pop("out_features") + elif adapter_cfg_cls in (LoraMoeHto4HAdapterConfig, LoraMoe4HtoHAdapterConfig): + config_args.update({'num_moe_experts': cfg.num_moe_experts}) if lora_cfg.weight_tying: position_embedding_strategy = lora_cfg.get("position_embedding_strategy", None) From bbdcd20c5753a4995957493c2e0ba4c2fd12054f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 11 Jun 2024 22:16:42 +0200 Subject: [PATCH 07/25] ci: Enrich notifications (#9412) * ci: Extract step output Signed-off-by: Oliver Koenig * ci: Enrich notifications Signed-off-by: Oliver Koenig * ci(notifications): Catch case multiple failures Signed-off-by: Oliver Koenig * ci(notifications): Logs to single line Signed-off-by: Oliver Koenig * ci(notifications): Infer job_url Signed-off-by: Oliver Koenig * ci(notifications): Make author and url clickable Signed-off-by: Oliver Koenig * ci(notifications): Extract the last 2K chars Signed-off-by: Oliver Koenig * ci(notifications): Update docs Signed-off-by: Oliver Koenig * ci(notifications): Disable b64 wrapping Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/scripts/slackHelper.sh | 23 ---------- .github/workflows/_test_template.yml | 39 +++++++++++++++- .github/workflows/cicd-main.yml | 66 +++++++++++++++++++++++++--- 3 files changed, 98 insertions(+), 30 deletions(-) delete mode 100644 .github/scripts/slackHelper.sh diff --git a/.github/scripts/slackHelper.sh b/.github/scripts/slackHelper.sh deleted file mode 100644 index 4696cebcf13b..000000000000 --- a/.github/scripts/slackHelper.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -function sendSlackMessage() { - - WEBHOOK_URL="$1" - PIPELINE_URL="$2" - - curl -X POST -H "Content-type: application/json" --data "{ - \"blocks\": [ - { - \"type\": \"section\", - \"text\": { - \"type\": \"mrkdwn\", - \"text\": \"\ -🚨 *CI/CD failure at <$PIPELINE_URL|NeMo CI>*: - -\" - } - } - ] - }" $WEBHOOK_URL - -} diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 31e9452d0fe5..065af34408cc 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -30,13 +30,16 @@ on: conclusion: description: Conclusion of main test step value: ${{ jobs.main.outputs.conclusion }} - + log: + description: Last 2000 characters of the test step's log + value: ${{ jobs.main.outputs.log }} jobs: main: runs-on: ${{ inputs.RUNNER }} timeout-minutes: ${{ inputs.TIMEOUT }} 
outputs: conclusion: ${{ steps.main.conclusion }} + log: ${{ steps.main.outputs.log }} container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -50,7 +53,39 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - id: main - run: ${{ inputs.SCRIPT }} + name: Run main script + run: | + set +e + ( + set -e + + ${{ inputs.SCRIPT }} + ) 2> >(tee err.log) + + EXIT_CODE=$? + # Slack only allows 3000 chars per block. + # Since a block contains information about other + # metdata than the log, we prune the log to 2000 + # chars. + min() { + if (( $1 > $2 )); then + echo $2 + else + echo $1 + fi + } + + log=$(cat err.log) + + MAX_LENGTH=$(echo $log | wc -m) + MAX_LENGTH=$(min $MAX_LENGTH 2000) + MAX_LENGTH=$(( $MAX_LENGTH - 1 )) + + log=$(echo "${log: -${MAX_LENGTH}}" | base64 -w 0) + echo "log=$log" | tee -a "$GITHUB_OUTPUT" + + exit $EXIT_CODE + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: failure() && inputs.IS_OPTIONAL == false - name: after_script diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 6cf60271e0d7..fab97d71f47a 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4284,12 +4284,68 @@ jobs: - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }} run: | - source .github/scripts/slackHelper.sh - - WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK }} + set -x + + PR_INFO=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/${{ github.repository }}/pulls/${{ github.event.number }} + ) + PR_URL=$(echo -E $PR_INFO | jq '.html_url' | tr -d '"') + PR_TITLE=$(echo -E $PR_INFO | jq '.title' | tr -d '"') + PIPELINE_URL=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - - sendSlackMessage "$WEBHOOK_URL" "$PIPELINE_URL" + BASE_MESSAGE=' + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "🚨 *CI/CD failure at <'$PIPELINE_URL'|NeMo CI>*." + } + } + ] + } + ' + + JOBS_URL="https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs" + SUMMARY="[]" + while IFS= read -r JOB; do + JOB_NAME="$(echo $JOB | jq '.key' | tr -d '"') / main" + JOB_ID=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" $JOBS_URL | jq --arg job_name "$JOB_NAME" -r '.jobs[] | select(.name == $job_name) | .id') + JOB_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}/job/$JOB_ID" + + LOGS=$(echo $JOB | yq '(.value.outputs.log | @base64d)' | tr -d '"') + + SUMMARY=$(echo "$SUMMARY" | jq \ + --arg pr "<$PR_URL|$PR_TITLE>" \ + --arg job "<$JOB_URL|$JOB_NAME>" \ + --arg logs "$LOGS" \ + --arg author "" \ + --arg branch ""\ + '. 
+= [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + "PR: " + $pr + + "\nJob: " + $job + + "\nAuthor: " + $author + + "\nBranch: " + $branch + + "\nLogs:" + + "```\n" + $logs + "\n```" + ) + } + } + ]') + done <<<$(echo '${{ toJSON(needs) }}' | jq -c 'to_entries | .[] | select(.value.outputs.conclusion == "failure")') + + MESSAGE=$(echo $BASE_MESSAGE | jq -c --argjson summary "$SUMMARY" '.blocks += $summary') + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }} - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} run: | From 070e63dad6d70e3c231d44d810e29b63f9422a0c Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 11 Jun 2024 13:52:47 -0700 Subject: [PATCH 08/25] apply user's precision to output checkpoint (#9222) Signed-off-by: Alexandros Koumparoulis --- .../convert_mistral_7b_nemo_to_hf.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index 07e12f36c3d7..99d1795aea9c 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -211,15 +211,18 @@ def convert(in_file, precision=None, cpu_only=True) -> None: else: output_layer_base_name = 'model.language_model.output_layer.weight' state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) - return state_dict, nemo_config + return state_dict, nemo_config, dtype if __name__ == '__main__': args = get_args() - hf_state_dict, nemo_config = convert(args.input_name_or_path, args.precision) + hf_state_dict, nemo_config, dtype = convert(args.input_name_or_path, args.precision) config = load_config(args.hf_model_name, nemo_config) - model = AutoModelForCausalLM.from_config(config) + model = AutoModelForCausalLM.from_config( + config, + torch_dtype=dtype, + ) model.load_state_dict(hf_state_dict) model.save_pretrained(args.output_path) hf_tokenizer = AutoTokenizer.from_pretrained(args.hf_model_name) From 3c29fefe9ac442e594f1c35c0f8ecc09b5ef5015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Tue, 11 Jun 2024 22:49:05 -0400 Subject: [PATCH 09/25] Fix failing RIR unit test with lhotse 1.24+ (#9444) --- .../common/test_lhotse_dataloading.py | 144 ++++++++++++++---- 1 file changed, 117 insertions(+), 27 deletions(-) diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index 744e2884d015..111c00df392a 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -158,9 +158,10 @@ def nemo_tarred_manifest_path(nemo_manifest_path: Path) -> Tuple[str, str]: root = nemo_manifest_path.parent / "nemo_tar" root.mkdir(exist_ok=True) - with TarWriter(f"{root}/audios_%01d.tar", shard_size=5) as tar_writer, SequentialJsonlWriter( - root / "tarred_audio_filepaths.jsonl" - ) as mft_writer: + with ( + TarWriter(f"{root}/audios_%01d.tar", shard_size=5) as tar_writer, + SequentialJsonlWriter(root / "tarred_audio_filepaths.jsonl") as mft_writer, + ): for idx, d in enumerate(load_jsonl(nemo_manifest_path)): p = d["audio_filepath"] name = Path(p).name @@ -856,7 +857,7 @@ def test_lazy_nemo_iterator_with_offset_field(tmp_path: Path): from nemo.collections.common.data.lhotse.nemo_adapters import 
LazyNeMoIterator # Have to generate as INT16 to avoid quantization error after saving to 16-bit WAV - INT16MAX = 2 ** 15 + INT16MAX = 2**15 expected_audio = np.random.randint(low=-INT16MAX - 1, high=INT16MAX, size=(16000,)).astype(np.float32) / INT16MAX audio_path = str(tmp_path / "dummy.wav") sf.write(audio_path, expected_audio, 16000) @@ -904,7 +905,7 @@ def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path): from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator # Have to generate as INT16 to avoid quantization error after saving to 16-bit WAV - INT16MAX = 2 ** 15 + INT16MAX = 2**15 expected_audio = np.random.randint(low=-INT16MAX - 1, high=INT16MAX, size=(16000,)).astype(np.float32) / INT16MAX audio_path = str(tmp_path / "dummy.wav") sf.write(audio_path, expected_audio, 16000) @@ -950,7 +951,13 @@ def test_lhotse_cuts_resolve_relative_paths(tmp_path: Path): CutSet([cut]).to_file(cuts_path) config = OmegaConf.create( - {"cuts_path": cuts_path, "sample_rate": 16000, "use_lhotse": True, "num_workers": 0, "batch_size": 2,} + { + "cuts_path": cuts_path, + "sample_rate": 16000, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 2, + } ) dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=_Identity()) @@ -981,13 +988,21 @@ def test_extended_data_input_cfg(cutset_shar_path, nemo_tarred_manifest_path_mul "manifest_filepath": nemo_tarred_manifest_path_multi[0], "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D1", + }, }, { "type": "lhotse_shar", "shar_path": cutset_shar_path, "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D2", + }, }, ], "sample_rate": 16000, @@ -1031,17 +1046,27 @@ def test_extended_data_input_cfg_subgroup(cutset_shar_path, nemo_tarred_manifest "manifest_filepath": nemo_tarred_manifest_path_multi[0], "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D1", + }, }, { "type": "lhotse_shar", "shar_path": cutset_shar_path, "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D2", + }, }, ], "weight": 0.2, - "tags": {"group_name": "G1",}, + "tags": { + "group_name": "G1", + }, }, { "type": "group", @@ -1052,16 +1077,26 @@ def test_extended_data_input_cfg_subgroup(cutset_shar_path, nemo_tarred_manifest "manifest_filepath": nemo_tarred_manifest_path_multi[0], "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D3",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D3", + }, }, { "type": "lhotse_shar", "shar_path": cutset_shar_path, "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D4",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D4", + }, }, ], - "tags": {"group_name": "G2",}, + "tags": { + "group_name": "G2", + }, }, ], "sample_rate": 16000, @@ -1107,13 +1142,21 @@ def test_extended_data_input_cfg_yaml_path(tmp_path, cutset_shar_path, nemo_tarr 
"manifest_filepath": str(nemo_tarred_manifest_path_multi[0]), "tarred_audio_filepaths": str(nemo_tarred_manifest_path_multi[1]), "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D1", + }, }, { "type": "lhotse_shar", "shar_path": str(cutset_shar_path), "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D2", + }, }, ] @@ -1166,7 +1209,13 @@ def txt_es_path(tmp_path_factory): def test_text_file_input(txt_en_path, txt_es_path): config = OmegaConf.create( { - "input_cfg": [{"type": "txt", "paths": txt_en_path, "language": "en",},], + "input_cfg": [ + { + "type": "txt", + "paths": txt_en_path, + "language": "en", + }, + ], "shuffle": True, "num_workers": 0, "batch_size": 4, @@ -1312,13 +1361,17 @@ def test_multimodal_text_audio_dataloading( "target_paths": es_paths, "source_language": "en", "target_language": "es", - "tags": {"modality": "text",}, + "tags": { + "modality": "text", + }, }, { "type": "nemo_tarred", "manifest_filepath": manifest_filepath, "tarred_audio_filepaths": tarred_audio_filepaths, - "tags": {"modality": "audio",}, + "tags": { + "modality": "audio", + }, }, ], "shuffle": True, @@ -1339,7 +1392,11 @@ def test_multimodal_text_audio_dataloading( ) dl = get_lhotse_dataloader_from_config( - config=config, global_rank=0, world_size=1, dataset=Identity(), tokenizer=en_es_tokenizer, + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + tokenizer=en_es_tokenizer, ) # Note: we use islice here because the dataloader will be infinite. @@ -1402,7 +1459,12 @@ def test_dataloader_with_noise_nemo_json(cutset_path: Path, nemo_manifest_path: "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 2 @@ -1426,7 +1488,12 @@ def test_dataloader_with_noise_lhotse_jsonl(cutset_path: Path): "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 2 @@ -1443,7 +1510,10 @@ def test_dataloader_with_noise_nemo_tar(cutset_path: Path, nemo_tarred_manifest_ config = OmegaConf.create( { "cuts_path": str(cutset_path), - "noise_path": {"manifest_filepath": noise_json, "tarred_audio_filepaths": noise_tar,}, + "noise_path": { + "manifest_filepath": noise_json, + "tarred_audio_filepaths": noise_tar, + }, "noise_mix_prob": 1.0, "noise_snr": [-5.0, 5.0], "batch_size": 2, @@ -1451,7 +1521,12 @@ def test_dataloader_with_noise_nemo_tar(cutset_path: Path, nemo_tarred_manifest_ "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 2 @@ -1464,6 +1539,8 @@ def test_dataloader_with_noise_nemo_tar(cutset_path: Path, nemo_tarred_manifest_ def test_dataloader_with_synth_rir(cutset_path: Path): + from 
lhotse.augmentation import ReverbWithImpulseResponse + config = OmegaConf.create( { "cuts_path": str(cutset_path), @@ -1474,7 +1551,12 @@ def test_dataloader_with_synth_rir(cutset_path: Path): "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 4 @@ -1487,8 +1569,16 @@ def test_dataloader_with_synth_rir(cutset_path: Path): cut = batch[2] assert isinstance(cut, MonoCut) assert isinstance(cut.recording.transforms, list) and len(cut.recording.transforms) == 1 - assert cut.recording.transforms[0]["name"] == "ReverbWithImpulseResponse" + tfnm = cut.recording.transforms[0] + if isinstance(tfnm, dict): # lhotse<=1.23.0 + assert tfnm["name"] == "ReverbWithImpulseResponse" + else: # lhotse>=1.24.0 + assert isinstance(tfnm, ReverbWithImpulseResponse) cut = batch[3] assert isinstance(cut, MonoCut) assert isinstance(cut.recording.transforms, list) and len(cut.recording.transforms) == 1 - assert cut.recording.transforms[0]["name"] == "ReverbWithImpulseResponse" + tfnm = cut.recording.transforms[0] + if isinstance(tfnm, dict): # lhotse<=1.23.0 + assert tfnm["name"] == "ReverbWithImpulseResponse" + else: # lhotse>=1.24.0 + assert isinstance(tfnm, ReverbWithImpulseResponse) From 8e7e46052d12a27bd2c601240878c3406aba58b0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 12 Jun 2024 12:50:56 +0200 Subject: [PATCH 10/25] Add option for mutex timeout in distributed optimizer backward hook (#9087) (#9091) * Tim: Add option for timeout in distopt callback mutex * Replace parent's _lock * Revert "Replace parent's _lock" This reverts commit 972d1b60432009e729bd51ac3b2d989cb4368b82. * Raise RuntimeError when timeout * Change RuntimeError to print --------- Signed-off-by: Jaemin Choi Co-authored-by: Jaemin Choi Co-authored-by: Jaemin Choi Co-authored-by: Michal Futrega Co-authored-by: Pablo Garay --- nemo/core/optim/distributed_adam.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py index 77d00de89232..716c905493e0 100644 --- a/nemo/core/optim/distributed_adam.py +++ b/nemo/core/optim/distributed_adam.py @@ -13,6 +13,7 @@ # limitations under the License. import collections +import contextlib import itertools from typing import Callable, Dict, Iterable, Optional, Union @@ -108,6 +109,8 @@ class MegatronDistributedFusedAdam(DistributedFusedAdam): but requires larger memory than distributing within all ranks, especially for pure data parallel models. (default: False). + lock_timeout (float, optional): timeout for callback mutex in + seconds. **kwargs: keyword arguments to pass to Apex DistributedFusedAdam. 
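For orientation before the implementation hunks below: the option is opt-in and only changes behaviour when a timeout is passed at construction time. A minimal usage sketch follows; the module path and class name come from this diff, while the stand-in model, the learning rate, and the 30-second timeout are illustrative assumptions, and a real run additionally needs Apex and a distributed setup.

    import torch

    from nemo.core.optim.distributed_adam import MegatronDistributedFusedAdam

    model = torch.nn.Linear(1024, 1024)  # stand-in module for illustration only

    optimizer = MegatronDistributedFusedAdam(
        model.parameters(),
        lr=1e-4,            # forwarded through **kwargs to Apex DistributedFusedAdam
        lock_timeout=30.0,  # new option: stop waiting for the callback mutex after 30 seconds
    )

With lock_timeout left at None the optimizer keeps using the plain lock, so existing configurations are unaffected.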
@@ -118,6 +121,7 @@ def __init__( params: Union[Iterable[torch.nn.Parameter], Iterable[dict]], disable_distributed_parameters: bool = False, distribute_within_nodes: bool = False, + lock_timeout: Optional[float] = None, **kwargs, ): @@ -152,6 +156,25 @@ def __init__( # Construct distributed optimizer super().__init__(param_groups, **kwargs) + # Create mutex with timeout + self._lock_with_timeout = None + if lock_timeout is not None: + + @contextlib.contextmanager + def lock_with_timeout(): + result = self._lock.acquire(timeout=lock_timeout) + try: + yield result + finally: + if result: + # Acquired lock before timeout + self._lock.release() + else: + # Failed to acquire lock before timeout + print(f'MegatronDistributedFusedAdam: Failed to acquire lock within {lock_timeout} seconds.') + + self._lock_with_timeout = lock_with_timeout + def _broadcast_params(self) -> None: # Assume params have already been synchronized pass @@ -166,7 +189,10 @@ def hook(*unused): 'before the forward pass (e.g. by calling data_ptr) ' 'or run DistributedFusedAdam with overlap_param_sync=False.' ) - with self._lock: + lock = self._lock + if self._lock_with_timeout is not None: + lock = self._lock_with_timeout() + with lock: need_to_initialize = 'fragments' not in self.state[param] if need_to_initialize: self._init_param_state(param, param_group_id, param_id) From 5f6ca08b91e3b249947ef1992d372304bfd7dc6f Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Wed, 12 Jun 2024 17:21:29 +0200 Subject: [PATCH 11/25] [NeMo-UX] Adding support for mcore distributed optimizer (#9435) * Fixing mcore DDP wrapping * Trying to add support for mcore * Proposal how to support mcore's distributed optimizer * Apply isort and black reformatting Signed-off-by: marcromeyn * Remove some un-used code * Remove some un-used code * Apply isort and black reformatting Signed-off-by: marcromeyn * Make design more robust * Make design more robust * Re-use getattr_proxy * Apply isort and black reformatting Signed-off-by: marcromeyn * Add all-reduces to MegatronOptim * Apply isort and black reformatting Signed-off-by: marcromeyn * Remove optimizer_fn from GPTConfig * Apply isort and black reformatting Signed-off-by: marcromeyn * Trying to fix failing megatron_parallel tests * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: marcromeyn Co-authored-by: marcromeyn Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --- nemo/collections/llm/gpt/model/base.py | 24 ++++--- nemo/lightning/megatron_parallel.py | 77 ++++++++++++++--------- nemo/lightning/optim.py | 66 +++++++++++++++++++ nemo/lightning/pytorch/strategies.py | 34 ++++++---- tests/lightning/test_megatron_parallel.py | 3 +- 5 files changed, 152 insertions(+), 52 deletions(-) create mode 100644 nemo/lightning/optim.py diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 9bf710d98928..9f5c23493d03 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -1,15 +1,18 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional +from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union import pytorch_lightning as L import torch import torch.distributed from megatron.core.transformer.transformer_config import TransformerConfig +from pytorch_lightning.utilities.types import OptimizerLRScheduler +from torch import nn from torch.optim import Optimizer from nemo.collections.llm import fn 
from nemo.lightning import get_vocab_size, io from nemo.lightning.megatron_parallel import MaskedTokenLossReduction +from nemo.lightning.optim import MegatronOptim, OptimizerConfig if TYPE_CHECKING: from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel @@ -33,8 +36,6 @@ class GPTConfig(TransformerConfig): # TODO: Move this to better places? get_attention_mask_from_fusion: bool = False - optimizer_fn: Optional[Callable[["GPTModel"], Optimizer]] = None - def configure_model(self, tokenizer) -> "MCoreGPTModel": vp_size = self.virtual_pipeline_model_parallel_size if vp_size: @@ -69,20 +70,19 @@ def __init__( self, config: GPTConfig, # TODO: Add transformer_layer_spec when we update mcore + optim: Optional[Union[MegatronOptim, Callable[[nn.Module], OptimizerLRScheduler]]] = None, tokenizer: Optional["TokenizerSpec"] = None, ): super().__init__() self.config = config self.tokenizer = tokenizer + self.optim = optim or MegatronOptim(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True)) def configure_model(self) -> None: self.module = self.config.configure_model(self.tokenizer) - def configure_optimizers(self) -> Optimizer: - if self.config.optimizer_fn is not None: - return self.config.optimizer_fn(self) - - return gpt_default_optimizer(self) + def configure_optimizers(self, megatron_parallel=None): + return self.optim(megatron_parallel or self) def forward( self, @@ -172,9 +172,13 @@ def gpt_forward_step(model, batch) -> torch.Tensor: def gpt_default_optimizer(module) -> Optimizer: - from apex.optimizers import FusedAdam + # from apex.optimizers import FusedAdam + + from megatron.core.optimizer import OptimizerConfig + + return OptimizerConfig(lr=1e-4) - return FusedAdam(module.parameters(), lr=1e-4) + # return FusedAdam(module.parameters(), lr=1e-4) def get_batch_on_this_context_parallel_rank(batch): diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index d23e57941aaf..12a9da97c342 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -3,6 +3,7 @@ import functools import inspect import queue +import types from collections import defaultdict from typing import ( Any, @@ -24,6 +25,7 @@ import torch import torch.distributed +from megatron.core.distributed import DistributedDataParallel as McoreDDP from megatron.core.distributed import DistributedDataParallelConfig from torch import Tensor, nn @@ -132,37 +134,37 @@ def __init__( _model.configure_model() _pipeline.append(_model) - if isinstance(ddp_config, DistributedDataParallelConfig): - from megatron.core.distributed import DistributedDataParallel as McoreDDP - - _pipeline = [ - McoreDDP( - model_chunk.config, - ddp_config, - model_chunk, - data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), - expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), - # Turn off bucketing for model_chunk 2 onwards, since communication for these - # model chunks is overlapped with compute anyway. 
- disable_bucketing=(model_chunk_idx > 0), - ) - for (model_chunk_idx, model_chunk) in enumerate(_pipeline) - ] + if isinstance(ddp_config, DistributedDataParallelConfig): + for model_chunk_idx, model_chunk in enumerate(_pipeline): + module = model_chunk.module + ddp = DDP( + module.config, + ddp_config, + module, + data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. + disable_bucketing=(model_chunk_idx > 0), + ) + model_chunk.module = ddp + model_chunk.buffers = ddp.buffers # We need to do this explicitly since this is a attr pytorch uses + model_chunk.__class__.__getattr__ = getattr_proxy # type: ignore - for i, model_module in enumerate(_pipeline): - if not cpu: - model_module.cuda(torch.cuda.current_device()) + for i, model_module in enumerate(_pipeline): + if not cpu: + model_module.cuda(torch.cuda.current_device()) - for param in model_module.parameters(): - set_defaults_if_not_set_tensor_model_parallel_attributes(param) + for param in model_module.parameters(): + set_defaults_if_not_set_tensor_model_parallel_attributes(param) - if hasattr(model_module, "configure_model"): - if not hasattr(model_module, "set_input_tensor"): - if hasattr(model_module.module, "set_input_tensor"): - model_module.set_input_tensor = model_module.module.set_input_tensor - else: - # TODO: What to do here? - pass + if hasattr(model_module, "configure_model"): + if not hasattr(model_module, "set_input_tensor"): + if hasattr(model_module.module, "set_input_tensor"): + model_module.set_input_tensor = model_module.module.set_input_tensor + else: + # TODO: What to do here? + pass # Print number of parameters. if parallel_state.model_parallel_is_initialized() and parallel_state.get_data_parallel_rank() == 0: @@ -536,6 +538,7 @@ def __init__(self, name: str, is_property: bool = False, includes_self: bool = F self.includes_self = includes_self def __call__(self, module: nn.Module): + attr = getattr(module, self.name) if self.is_property: @@ -554,6 +557,24 @@ def wrapped(self, *args): return attr +def getattr_proxy(self, item: Any) -> Any: + try: + return super(self.__class__, self).__getattr__(item) + except AttributeError: + try: + return getattr(self.module, item) + except AttributeError: + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'") + + +class DDP(McoreDDP): + def state_dict(self, prefix='', keep_vars=False, **kwargs): + self.module.state_dict(prefix=prefix, keep_vars=keep_vars, **kwargs) + + def __getattr__(self, item: Any) -> Any: + return getattr_proxy(self, item) + + class CallbackConnector: """ A connector for managing and invoking callbacks. 
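The getattr_proxy helper and the DDP subclass added above boil down to one attribute-forwarding idea: the wrapper resolves attribute lookups on itself first and only then falls back to the wrapped module, so that attributes defined on the underlying Megatron module (for example set_input_tensor) remain reachable after wrapping. A small, framework-independent sketch of the same pattern; the Wrapper class below is illustrative and not part of this patch.

    import torch.nn as nn

    class Wrapper(nn.Module):
        def __init__(self, module: nn.Module):
            super().__init__()
            self.module = module

        def __getattr__(self, item):
            try:
                # nn.Module keeps parameters/buffers/submodules in internal dicts,
                # so ask the parent implementation first.
                return super().__getattr__(item)
            except AttributeError:
                # Fall back to the wrapped module, mirroring getattr_proxy above.
                return getattr(self.module, item)

    inner = nn.Linear(4, 2)
    wrapped = Wrapper(inner)
    assert wrapped.out_features == 2  # resolved on the wrapped Linear via the fallback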
diff --git a/nemo/lightning/optim.py b/nemo/lightning/optim.py new file mode 100644 index 000000000000..d706680776bc --- /dev/null +++ b/nemo/lightning/optim.py @@ -0,0 +1,66 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Callable, Optional + +from megatron.core.distributed import finalize_model_grads +from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer +from megatron.core.utils import get_model_config +from pytorch_lightning.utilities.types import OptimizerLRScheduler +from torch.optim import Optimizer + +if TYPE_CHECKING: + from nemo.lightning.megatron_parallel import MegatronParallel + + +@dataclass +class MegatronOptim: + config: OptimizerConfig + finalize_model_grads: Callable = finalize_model_grads + + def create_optimizer( + self, + megatron_parallel: "MegatronParallel", + no_weight_decay_cond: Optional[Callable] = None, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, + ) -> Optimizer: + from nemo.core.optim import McoreDistributedOptimizer + + # TODO: Where should we put this? + get_model_config(megatron_parallel[0]).finalize_model_grads = finalize_model_grads + + mcore_opt = get_megatron_optimizer( + self.config, + list(megatron_parallel), + no_weight_decay_cond=no_weight_decay_cond, + scale_lr_cond=scale_lr_cond, + lr_mult=lr_mult, + ) + + return McoreDistributedOptimizer(mcore_opt) + + def configure_optimizer(self, megatron_parallel: "MegatronParallel") -> OptimizerLRScheduler: + from nemo.core.optim.lr_scheduler import CosineAnnealing + + opt = self.create_optimizer(megatron_parallel) + + # TODO: Make this configurable through the dataclass + lr_scheduler = CosineAnnealing(opt, max_steps=10, warmup_steps=750, constant_steps=80000, min_lr=int(6e-5)) + + return { + "optimizer": opt, + # REQUIRED: The scheduler instance + "scheduler": lr_scheduler, + # The unit of the scheduler's step size, could also be 'step'. + # 'epoch' updates the scheduler on epoch end whereas 'step' + # updates it after a optimizer update. + "interval": "epoch", + # How many epochs/steps should pass between calls to + # `scheduler.step()`. 1 corresponds to updating the learning + # rate after every epoch/step. 
+ "frequency": 1, + # Metric to to monitor for schedulers like `ReduceLROnPlateau` + "monitor": "val_loss", + } + + def __call__(self, megatron_parallel: "MegatronParallel") -> OptimizerLRScheduler: + return self.configure_optimizer(megatron_parallel) diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 8fa178d7df01..7daef032376b 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -1,4 +1,5 @@ import functools +import inspect import logging import shutil from collections import OrderedDict @@ -90,7 +91,7 @@ def __init__( self.ckpt_include_optimizer = ckpt_include_optimizer if ddp == "megatron": - self.ddp_config = DistributedDataParallelConfig() + self.ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True) elif isinstance(ddp, DistributedDataParallelConfig): self.ddp_config = ddp elif ddp == "pytorch": @@ -165,18 +166,6 @@ def setup(self, trainer: pl.Trainer) -> None: trainer.fit_loop.epoch_loop.automatic_optimization = _MegatronAutomaticOptimization(trainer) - # set up optimizers after the wrapped module has been moved to the device - self.setup_optimizers(trainer) - - # TODO: Throw an execption if we have a mcore optimizer and no ddp_config - - if hasattr(self.precision_plugin, "convert_optimizer"): - _optimizers = [*self.optimizers] - _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) - self.optimizers = _optimizers - - _optimizers_to_device(self.optimizers, self.root_device) - import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD if isinstance(self._ddp_comm_state, post_localSGD.PostLocalSGDState): @@ -223,6 +212,25 @@ def setup_megatron_parallel(self, trainer: pl.Trainer) -> None: cpu=isinstance(trainer.accelerator, CPUAccelerator), ddp_config=self.ddp_config, ) + + # check signature-def of self.model.configure_optimizers to check if there's an optional arg: megatron_parallel + sig = inspect.signature(self.model.configure_optimizers) + if "megatron_parallel" in sig.parameters: + self.model.configure_optimizers = functools.partial( + self.model.configure_optimizers, megatron_parallel=self.megatron_parallel + ) + + self.setup_optimizers(trainer) + + # TODO: Throw an execption if we have a mcore optimizer and no ddp_config + + if hasattr(self.precision_plugin, "convert_optimizer"): + _optimizers = [*self.optimizers] + _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) + self.optimizers = _optimizers + + _optimizers_to_device(self.optimizers, self.root_device) + self.model = self.megatron_parallel self.model.trainer = trainer diff --git a/tests/lightning/test_megatron_parallel.py b/tests/lightning/test_megatron_parallel.py index 31d20170c0b6..fafd25e49f5a 100644 --- a/tests/lightning/test_megatron_parallel.py +++ b/tests/lightning/test_megatron_parallel.py @@ -55,7 +55,7 @@ def test_init_with_defaults(self, mocker, mock_pipeline): mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_world_size', return_value=1) mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=False) - megatron_parallel = mp.MegatronParallel(pipeline=mock_pipeline) + megatron_parallel = mp.MegatronParallel(pipeline=mock_pipeline, cpu=True) assert megatron_parallel.pipeline == mock_pipeline assert megatron_parallel.precision_plugin is None @@ -85,6 +85,7 @@ def test_init_with_custom_parameters( data_step=mock_data_step, forward_step=mock_forward_step, 
loss_reduction=mock_loss_reduction, + cpu=True, ) assert megatron_parallel.pipeline == mock_pipeline From 290456fba9cc2ca2c5a12a3ec9033792010aa206 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Wed, 12 Jun 2024 17:37:44 +0200 Subject: [PATCH 12/25] Use ModelOpt build_tensorrt_llm for building engines for qnemo checkpoints (#9452) * Enable specyfing alpha for SQ Signed-off-by: Jan Lasek * Enable specifying use_custom_all_reduce for export Signed-off-by: Jan Lasek * Use native TRT-LLM param names in export (partial) Signed-off-by: Jan Lasek * Detect TRT-LLM checkpoint programatically Signed-off-by: Jan Lasek * Pass use_custom_all_reduce in test_nemo_export.py Signed-off-by: Jan Lasek * Paramter parsing bugfix Signed-off-by: Jan Lasek * Revert "Paramter parsing bugfix" This reverts commit b0a4dd3859eec5258b3091daad27c292979a154f. Signed-off-by: Jan Lasek * Revert "Enable specifying use_custom_all_reduce for export" This reverts commit 9e419e3587a8b5c1eb8deda843ba37ee0fb1cf0d. Signed-off-by: Jan Lasek * Revert "Pass use_custom_all_reduce in test_nemo_export.py" This reverts commit be7081248b6d31a389e79438cdbe8737c51803ee. Signed-off-by: Jan Lasek * Rename checkpoint detection function Signed-off-by: Jan Lasek * Use ModelOpt build_tensorrt_llm utility for qnemo for performance alignment Signed-off-by: Jan Lasek * Import fix Signed-off-by: Jan Lasek * Apply isort and black reformatting Signed-off-by: janekl --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Co-authored-by: janekl --- nemo/export/tensorrt_llm.py | 13 ++- .../trt_llm/qnemo/qnemo_to_tensorrt_llm.py | 92 +++++++++---------- nemo/export/trt_llm/qnemo/utils.py | 18 ++++ 3 files changed, 76 insertions(+), 47 deletions(-) create mode 100644 nemo/export/trt_llm/qnemo/utils.py diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index c826848e9328..6ad9d57a2ab8 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -33,6 +33,7 @@ from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import get_tokenzier, is_nemo_file, load_nemo_model from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer +from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load @@ -229,7 +230,7 @@ def export( tmp_dir = tempfile.TemporaryDirectory() nemo_export_dir = Path(tmp_dir.name) - if nemo_checkpoint_path.endswith("qnemo"): + if is_qnemo_checkpoint(nemo_checkpoint_path): if os.path.isdir(nemo_checkpoint_path): nemo_export_dir = nemo_checkpoint_path else: @@ -244,7 +245,17 @@ def export( max_output_len=max_output_len, max_batch_size=max_batch_size, max_prompt_embedding_table_size=max_prompt_embedding_table_size, + tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, + use_parallel_embedding=use_parallel_embedding, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + enable_multi_block_mode=enable_multi_block_mode, + use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, + max_lora_rank=max_lora_rank, + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, ) else: model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py index 
b7e2f7bc2973..630330381e56 100644 --- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py +++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json -import os -import subprocess +import glob +import os +import warnings from typing import List, Optional -CONFIG_NAME = "config.json" +from modelopt.deploy.llm import build_tensorrt_llm + +from nemo.export.trt_llm.qnemo.utils import CONFIG_NAME, WEIGHTS_NAME def qnemo_to_tensorrt_llm( @@ -28,50 +30,48 @@ def qnemo_to_tensorrt_llm( max_output_len: int, max_batch_size: int, max_prompt_embedding_table_size: int, + tensor_parallel_size: int = None, + pipeline_parallel_size: int = None, + use_parallel_embedding: bool = False, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + enable_multi_block_mode: bool = False, + use_lora_plugin: str = None, lora_target_modules: Optional[List[str]] = None, + max_lora_rank: int = 64, + max_num_tokens: int = None, + opt_num_tokens: int = None, ): - """Build TRT-LLM engine via trtllm-build CLI API in a subprocess.""" + """Build TensorRT-LLM engine with ModelOpt build_tensorrt_llm function.""" assert not lora_target_modules, f"LoRA is not supported for quantized checkpoints, got {lora_target_modules}" - print( - "Note that setting n_gpus, tensor_parallel_size and pipeline_parallel_size parameters" - " for quantized models is possible only on export step via nemo.export.quantize module." - " These parameters are ignored when building and running TensorRT-LLM engine below." + + warnings.warn( + "Note that setting tensor_parallel_size and pipeline_parallel_size parameters" + " for quantized models should be done on calibration step with nemo.export.quantize module." + " These parameters are ignored when building and running TensorRT-LLM engine below.", + UserWarning, + stacklevel=3, ) - # Load config to explicitly pass selected parameters to trtllm-build command: - with open(os.path.join(nemo_checkpoint_path, CONFIG_NAME), "r") as f: - model_config = json.load(f) - command = [ - "trtllm-build", - "--checkpoint_dir", - nemo_checkpoint_path, - "--output_dir", - engine_dir, - "--max_batch_size", - str(max_batch_size), - "--max_input_len", - str(max_input_len), - "--max_output_len", - str(max_output_len), - "--max_prompt_embedding_table_size", - str(max_prompt_embedding_table_size), - "--gemm_plugin", - model_config["dtype"], - "--gpt_attention_plugin", - model_config["dtype"], - "--strongly_typed", - "--use_custom_all_reduce", - "disable", - "--workers", - str(model_config["mapping"]["world_size"]), - ] - command_str = " ".join(command) - print(f"Build command is:\n{command_str}") - print("Running trtllm-build, this may take a while...") - result = subprocess.run(command, capture_output=True) # TODO: consider streaming logs - if result.returncode != 0: - print(result.stdout.decode()) - print(result.stderr.decode()) - raise RuntimeError("Error encountered for trtllm-build command, please check logs.") - print("Building engine done. 
Full logs are:") - print(result.stdout.decode()) + warnings.warn( + "Also use_parallel_embedding, paged_kv_cache, remove_input_padding, enable_multi_block_mode, max_num_tokens" + " and opt_num_tokens parameters are set by ModelOpt build_tensorrt_llm function in the optimal way and are" + " ignored on engine build step.", + UserWarning, + stacklevel=3, + ) + + num_build_workers = len(glob.glob(os.path.join(nemo_checkpoint_path, WEIGHTS_NAME.format("*")))) + assert num_build_workers, f"No TensorRT-LLM weight files found in {nemo_checkpoint_path}" + + build_tensorrt_llm( + pretrained_config=os.path.join(nemo_checkpoint_path, CONFIG_NAME), + engine_dir=engine_dir, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + max_beam_width=1, + num_build_workers=num_build_workers, + enable_sparsity=False, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + ) diff --git a/nemo/export/trt_llm/qnemo/utils.py b/nemo/export/trt_llm/qnemo/utils.py new file mode 100644 index 000000000000..58d1d308507f --- /dev/null +++ b/nemo/export/trt_llm/qnemo/utils.py @@ -0,0 +1,18 @@ +import os +from pathlib import Path + +from nemo.export.tarutils import TarPath + +CONFIG_NAME = "config.json" +WEIGHTS_NAME = "rank{}.safetensors" + + +def is_qnemo_checkpoint(path: str) -> bool: + """Detect if a given path is a TensorRT-LLM a.k.a. "qnemo" checkpoint based on config & tensor data presence.""" + if os.path.isdir(path): + path = Path(path) + else: + path = TarPath(path) + config_path = path / CONFIG_NAME + tensor_path = path / WEIGHTS_NAME.format(0) + return config_path.exists() and tensor_path.exists() From 1c0bef011eb5b58a6fae76f1ae60cc94bf9b0bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 12 Jun 2024 18:36:15 +0200 Subject: [PATCH 13/25] ci: Fix extract last 2K chars of logs (#9450) ci(notifications): Fix extract of last 2K chars Signed-off-by: Oliver Koenig --- .github/workflows/_test_template.yml | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 065af34408cc..5956a23bdd67 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -63,26 +63,8 @@ jobs: ) 2> >(tee err.log) EXIT_CODE=$? - # Slack only allows 3000 chars per block. - # Since a block contains information about other - # metdata than the log, we prune the log to 2000 - # chars. 
- min() { - if (( $1 > $2 )); then - echo $2 - else - echo $1 - fi - } - - log=$(cat err.log) - - MAX_LENGTH=$(echo $log | wc -m) - MAX_LENGTH=$(min $MAX_LENGTH 2000) - MAX_LENGTH=$(( $MAX_LENGTH - 1 )) - - log=$(echo "${log: -${MAX_LENGTH}}" | base64 -w 0) - echo "log=$log" | tee -a "$GITHUB_OUTPUT" + + echo "log=$(tail -c 2000 err.log | base64 -w 0)" >> "$GITHUB_OUTPUT" exit $EXIT_CODE From f8eeb794c381f479bb3b245aac81415660549a6d Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Wed, 12 Jun 2024 14:26:08 -0700 Subject: [PATCH 14/25] Add option to merge distributed optimizer buckets (#9414) * Add option to merge distopt buckets in GPT Signed-off-by: Tim Moon * Move distopt bucket merge logic to base LLM class Signed-off-by: Tim Moon * Apply isort and black reformatting Signed-off-by: timmoon10 --------- Signed-off-by: Tim Moon Signed-off-by: timmoon10 Co-authored-by: timmoon10 Co-authored-by: Sangkug Lym --- .../models/language_modeling/megatron_base_model.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index e7f2aa805a9c..0828d88a8133 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -861,7 +861,15 @@ def configure_optimizers(self): # Initialize param buckets if explicitly provided if getattr(self, 'distributed_adam_buckets', None) is not None: - for bucket in self.distributed_adam_buckets: + buckets = self.distributed_adam_buckets + if self.cfg.get('distributed_adam_bucket_merge_size', 1) > 1: + # Merge buckets if needed + stride = self.cfg.get('distributed_adam_bucket_merge_size', 1) + buckets = [ + list(itertools.chain.from_iterable(buckets[i : i + stride])) + for i in range(0, len(buckets), stride) + ] + for bucket in buckets: self._optimizer.init_params_bucket(bucket) self._optimizer.init_params_bucket(self.parameters()) if hasattr(self, 'distributed_adam_buckets'): From 387f0b138d91da8996d982b8831ccf7370814ad1 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Wed, 12 Jun 2024 17:01:33 -0600 Subject: [PATCH 15/25] Update readme with mlperf news (#9457) * update Signed-off-by: eharper * update Signed-off-by: eharper * remove link to image Signed-off-by: eharper * remove link to image Signed-off-by: eharper * fix formatting Signed-off-by: eharper --------- Signed-off-by: eharper --- README.rst | 122 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 88 insertions(+), 34 deletions(-) diff --git a/README.rst b/README.rst index c4cbf759d975..ab3a4b6b06c9 100644 --- a/README.rst +++ b/README.rst @@ -45,58 +45,112 @@ Latest News
Large Language Models and Multimodal -
- Accelerate your generative AI journey with NVIDIA NeMo Framework on GKE (2024/03/16) +
+ + + NVIDIA sets new generative AI performance and scale records in MLPerf Training v4.0 + (2024/06/12) + + + Using NVIDIA NeMo Framework and NVIDIA Hopper GPUs NVIDIA was able to scale to 11,616 H100 GPUs and achieve near-linear performance scaling on LLM pretraining. + NVIDIA also achieved the highest LLM fine-tuning performance and raised the bar for text-to-image training. +

+
- An end-to-end walkthrough to train generative AI models on the Google Kubernetes Engine (GKE) using the NVIDIA NeMo Framework is available at https://github.com/GoogleCloudPlatform/nvidia-nemo-on-gke. The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework. +
+ + + Accelerate your generative AI journey with NVIDIA NeMo Framework on GKE + (2024/03/16) + + + An end-to-end walkthrough to train generative AI models on the Google Kubernetes Engine (GKE) using the NVIDIA NeMo Framework is available at https://github.com/GoogleCloudPlatform/nvidia-nemo-on-gke. + The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework.

- Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso (2024/03/06) - - Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises now leverages the NVIDIA NeMo Framework. The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation. Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference. + + + Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso + (2024/03/06) + + + Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises now leverages the NVIDIA NeMo Framework. + The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation. + Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference.

-
- -
- New NVIDIA NeMo Framework Features and NVIDIA H200 (2023/12/06) +
- NVIDIA NeMo Framework now includes several optimizations and enhancements, including: 1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models, 2) Mix of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale, 3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and 4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs. -

- H200-NeMo-performance -

-
- -
- NVIDIA now powers training for Amazon Titan Foundation models (2023/11/28) +
+ + + New NVIDIA NeMo Framework Features and NVIDIA H200 + (2023/12/06) + + + NVIDIA NeMo Framework now includes several optimizations and enhancements, + including: + 1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models, + 2) Mix of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale, + 3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and + 4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs. +

+ + H200-NeMo-performance +

+
- NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. The NeMo Framework provides a versatile framework for building, customizing, and running LLMs. -

-
+
+ + + NVIDIA now powers training for Amazon Titan Foundation models + (2023/11/28) + + + NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). + The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. + The NeMo Framework provides a versatile framework for building, customizing, and running LLMs. +

+
Speech Recognition -
- New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model (2024/04/18) - - The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization. Canary also provides bi-directional translation, between English and the three other supported languages. -

-
-
- Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models (2024/04/18) + + + New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model + (2024/04/18) + + + The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization. + Canary also provides bi-directional translation, between English and the three other supported languages. +

+
- NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhere—on any cloud and on-premises—released the Parakeet family of automatic speech recognition (ASR) models. These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy. +
+ + + Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models + (2024/04/18) + + + NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhere—on any cloud and on-premises—released the Parakeet family of automatic speech recognition (ASR) models. + These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy.

-
+
- Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT (2024/04/18) - - NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhere—on any cloud and on-premises—recently released Parakeet-TDT. This new addition to the  NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed over the previously best model, Parakeet-RNNT-1.1B. + + + Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT + (2024/04/18) + + + NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhere—on any cloud and on-premises—recently released Parakeet-TDT. + This new addition to the  NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed over the previously best model, Parakeet-RNNT-1.1B.

From a72a0e790703c8eced7d95afc0e57dda244b733b Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Wed, 12 Jun 2024 22:22:33 -0400 Subject: [PATCH 16/25] TRT-LLM 0.10 Update (#9402) * reorg the export code Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * replaced log with raise Signed-off-by: Onur Yilmaz * add converter and loader folders Signed-off-by: Onur Yilmaz * move nemo_ckpt_convert into the converter folder Signed-off-by: Onur Yilmaz * move nemo_file into loader folder Signed-off-by: Onur Yilmaz * reorg converter Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * continue to reorg converter Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * continue to reorg Signed-off-by: Onur Yilmaz * move nemo file back into nemo folder Signed-off-by: Onur Yilmaz * renamed nemo folder to nemo_ckpt_loader Signed-off-by: Onur Yilmaz * remove unused function Signed-off-by: Onur Yilmaz * removed nemo file Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * moved a function to tensorrt_llm_run file Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * Remove unused imports Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * import csv added Signed-off-by: Onur Yilmaz * update the APIs Signed-off-by: Onur Yilmaz * add use_embedding_sharing param Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * do not add unused inputs during MG export Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * add cpp runtime test Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * sharing embedding * Remove manually scaling * renaming to avoid nemo github issue Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Co-authored-by: oyilmaz-nvidia Co-authored-by: Bobby Chen --- nemo/export/tensorrt_llm.py | 10 +++- .../trt_llm/converter/model_converter.py | 36 +++++++++--- .../converter/model_to_trt_llm_ckpt.py | 6 -- nemo/export/trt_llm/tensorrt_llm_build.py | 4 +- .../{test_nemo_export.py => nemo_export.py} | 38 ++++++++++++ tests/export/run.sh | 58 +++++++++---------- 6 files changed, 106 insertions(+), 46 deletions(-) rename tests/export/{test_nemo_export.py => nemo_export.py} (94%) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 6ad9d57a2ab8..7cc92f0ca588 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -121,6 +121,7 @@ def export( n_gpus: int = 1, tensor_parallel_size: int = None, pipeline_parallel_size: int = None, + gpus_per_node: int = None, max_input_len: int = 256, max_output_len: int = 256, max_input_token: Optional[int] = None, @@ -128,6 +129,7 @@ def export( max_batch_size: int = 8, max_prompt_embedding_table_size=None, use_parallel_embedding: bool = False, + use_embedding_sharing: bool = False, paged_kv_cache: bool = True, remove_input_padding: bool = True, dtype: str = "bfloat16", @@ -150,6 +152,7 @@ def export( n_gpus (int): number of GPUs to use for inference. tensor_parallel_size (int): tensor parallelism. pipeline_parallel_size (int): pipeline parallelism. 
+ gpus_per_node (int): number of gpus per node. max_input_len (int): max input length. max_output_len (int): max output length. max_input_token (int): max input length. Deprecated, use max_input_len instead. @@ -157,6 +160,7 @@ def export( max_batch_size (int): max batch size. max_prompt_embedding_table_size (int): max prompt embedding size. use_parallel_embedding (bool): whether to use parallel embedding feature of TRT-LLM or not + use_embedding_sharing (bool): paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM. remove_input_padding (bool): enables removing input padding or not. dtype (str): Floating point type for model weights (Supports BFloat16/Float16). @@ -173,7 +177,7 @@ def export( if model_type not in self.get_supported_models_list: raise Exception( "Model {0} is not currently a supported model type. " - "Supported model types are llama, gptnext, falcon, and starcoder".format(model_type) + "Supported model types are llama, gptnext, falcon, and starcoder.".format(model_type) ) if model_type == "gpt" or model_type == "starcoder": @@ -189,6 +193,8 @@ def export( tensor_parallel_size = 1 pipeline_parallel_size = n_gpus + gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node + if Path(self.model_dir).exists(): if delete_existing_files and len(os.listdir(self.model_dir)) > 0: for files in os.listdir(self.model_dir): @@ -267,7 +273,9 @@ def export( dtype=dtype, tensor_parallel_size=tensor_parallel_size, pipeline_parallel_size=pipeline_parallel_size, + gpus_per_node=gpus_per_node, use_parallel_embedding=use_parallel_embedding, + use_embedding_sharing=use_embedding_sharing, ) for weight_dict, model_config in zip(weights_dicts, model_configs): diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 5e522d8bbff2..da13449160f9 100644 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -72,9 +72,17 @@ def model_to_trtllm_ckpt( dtype: str = "bfloat16", tensor_parallel_size: int = 1, pipeline_parallel_size: int = 1, + gpus_per_node: int = None, use_parallel_embedding: bool = False, + use_embedding_sharing: bool = False, ) -> Tuple[List[Dict], List[PretrainedConfig]]: + if nemo_model_config.get("share_embeddings_and_output_weights", False) and not use_embedding_sharing: + LOGGER.info( + "Found share_embeddings_and_output_weights is True in NeMo config, set use_embedding_sharing = True" + ) + use_embedding_sharing = True + weights_dict = convert_model_to_trt_llm_ckpt( model=model, nemo_model_config=nemo_model_config, @@ -88,12 +96,14 @@ def model_to_trtllm_ckpt( world_size = tensor_parallel_size * pipeline_parallel_size - lm_head_weight = weights_dict["lm_head.weight"] + has_lm_head = "lm_head.weight" in weights_dict + if has_lm_head: + lm_head_weight = weights_dict["lm_head.weight"] vocab_size = weights_dict["transformer.vocab_embedding.weight"].shape[0] - vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) + vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) if has_lm_head else vocab_size - if vocab_size_padded != vocab_size: + if has_lm_head and vocab_size_padded != vocab_size: pad_width = vocab_size_padded - vocab_size lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0) @@ -120,7 +130,7 @@ def model_to_trtllm_ckpt( 'hidden_act': hidden_act, 'use_parallel_embedding': use_parallel_embedding, 'embedding_sharding_dim': 0, - 'share_embedding_table': 
False, + 'share_embedding_table': use_embedding_sharing, 'quantization': { 'quant_algo': None, 'kv_cache_quant_algo': None, @@ -160,9 +170,15 @@ def model_to_trtllm_ckpt( "transformer.ln_f.bias", } + gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node + for i in range(world_size): mapping = tensorrt_llm.Mapping( - world_size=world_size, rank=i, tp_size=tensor_parallel_size, pp_size=pipeline_parallel_size + world_size=world_size, + rank=i, + tp_size=tensor_parallel_size, + pp_size=pipeline_parallel_size, + gpus_per_node=gpus_per_node, ) layers_range = mapping.pp_layers(num_layers) @@ -174,6 +190,8 @@ def model_to_trtllm_ckpt( if new_key.endswith(".bin"): # TP split if new_key.endswith(f"{mapping.tp_rank}.bin"): new_key = new_key.replace(f".{mapping.tp_rank}.bin", "") + else: + continue if "layers" in new_key: # PP layer_num = int(new_key.split(".")[2]) if layer_num in layers_range: @@ -202,15 +220,17 @@ def model_to_trtllm_ckpt( weights_dict_local["transformer.position_embedding.weight"] = pos_embedding_weight if mapping.is_last_pp_rank(): - weights_dict_local["lm_head.weight"] = np.ascontiguousarray( - split(lm_head_weight, mapping.tp_size, mapping.tp_rank) - ) + if has_lm_head: + weights_dict_local["lm_head.weight"] = np.ascontiguousarray( + split(lm_head_weight, mapping.tp_size, mapping.tp_rank) + ) weights_dict_local["transformer.ln_f.weight"] = weights_dict["transformer.ln_f.weight"] ln_f_bias = weights_dict.get("transformer.ln_f.bias") if ln_f_bias is not None: weights_dict_local["transformer.ln_f.bias"] = ln_f_bias + config["gpus_per_node"] = gpus_per_node model_config = PretrainedConfig(**config) model_config.mapping = mapping model_configs.append(model_config) diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py index df7e43548a44..c29edc87353e 100644 --- a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py +++ b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py @@ -158,8 +158,6 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): model_level_weights["transformer.position_embedding.weight"].append(val) if pp_idx == 0: val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)] - if embedding_scaling: - val = val * float(math.sqrt(hidden_size)) vocab_size = val.shape[0] if use_parallel_embedding: @@ -171,10 +169,6 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): val = torch_to_numpy(val.to(storage_type).cpu()) model_level_weights["transformer.vocab_embedding.weight"].append(val) - if share_embeddings_and_output: - val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)] - val = torch_to_numpy(val.to(storage_type).cpu()) - model_level_weights["lm_head.weight"].append(val) if has_lm_head and pp_idx == training_pp_size - 1: val = model.get("state_dict", model)[get_layer_name("output_layer", prefix)] val = torch_to_numpy(val.to(storage_type).cpu()) diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index bbafec319fd5..ef9a14c1d582 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -19,7 +19,7 @@ from tensorrt_llm.builder import BuildConfig, Builder from tensorrt_llm.commands.build import build as build_trtllm from tensorrt_llm.logger import logger -from tensorrt_llm.lora_manager import LoraBuildConfig +from tensorrt_llm.lora_manager import LoraConfig from tensorrt_llm.models.modeling_utils import 
add_lora, optimize_model, preprocess_weights from tensorrt_llm.plugin import PluginConfig @@ -94,7 +94,7 @@ def build_and_save_engine( if use_lora_plugin is not None: build_config.plugin_config.set_lora_plugin(use_lora_plugin) - lora_config = LoraBuildConfig( + lora_config = LoraConfig( lora_dir=lora_ckpt_list, lora_ckpt_source='nemo', max_lora_rank=max_lora_rank, diff --git a/tests/export/test_nemo_export.py b/tests/export/nemo_export.py similarity index 94% rename from tests/export/test_nemo_export.py rename to tests/export/nemo_export.py index bac592c90cc2..5541cc0f8673 100644 --- a/tests/export/test_nemo_export.py +++ b/tests/export/nemo_export.py @@ -128,6 +128,7 @@ def run_trt_llm_inference( trt_llm_model_dir, n_gpu=1, max_batch_size=8, + use_embedding_sharing=False, max_input_len=128, max_output_len=128, ptuning=False, @@ -216,6 +217,7 @@ def run_trt_llm_inference( lora_target_modules=lora_target_modules, max_num_tokens=int(max_input_len * max_batch_size * 0.2), opt_num_tokens=60, + use_embedding_sharing=use_embedding_sharing, save_nemo_model_config=True, ) @@ -237,6 +239,14 @@ def run_trt_llm_inference( stop_words_list=stop_words_list, ) + if not use_lora_plugin and not ptuning: + test_cpp_runtime( + engine_path=trt_llm_model_dir, + prompt=prompt, + max_output_len=max_output_len, + debug=True, + ) + nq = None nm = None output_deployed = "" @@ -290,6 +300,27 @@ def run_trt_llm_inference( raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path)) +def test_cpp_runtime( + engine_path, + prompt, + max_output_len, + debug, +): + trt_llm_exporter = TensorRTLLM(engine_path, load_model=True) + output = trt_llm_exporter.forward( + input_texts=prompt, + max_output_len=max_output_len, + top_k=1, + top_p=0.0, + temperature=1.0, + ) + + if debug: + print("") + print("--- Output deployed with cpp runtime: ", output) + print("") + + def run_existing_checkpoints( model_name, n_gpus, @@ -332,6 +363,12 @@ def run_existing_checkpoints( else: raise Exception("There is not lora checkpoint path defined.") + if model_info["model_type"] == "gemma": + print("*********************") + use_embedding_sharing = True + else: + use_embedding_sharing = False + return run_trt_llm_inference( model_name=model_name, model_type=model_info["model_type"], @@ -340,6 +377,7 @@ def run_existing_checkpoints( trt_llm_model_dir=model_info["trt_llm_model_dir"], n_gpu=n_gpus, max_batch_size=model_info["max_batch_size"], + use_embedding_sharing=use_embedding_sharing, max_input_len=512, max_output_len=model_info["max_output_len"], ptuning=ptuning, diff --git a/tests/export/run.sh b/tests/export/run.sh index 0071b1351113..b3badd25a8f9 100644 --- a/tests/export/run.sh +++ b/tests/export/run.sh @@ -20,32 +20,32 @@ for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done set +x -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --streaming -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 2 --tp_size 1 --pp_size 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 4 --tp_size 2 --pp_size 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 8 --tp_size 1 --pp_size 8 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --ptuning --min_gpus 1 
--max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --lora --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-code --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-fp8 --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-int4 --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-int8 --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base-fp8 --existing_test_models --min_gpus 2 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_test_models --min_gpus 2 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_gpus 2 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_gpus 8 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_gpus 8 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-RLHF --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-SteerLM --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name GPT-43B-Base --existing_test_models --min_gpus 2 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_gpus 2 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name FALCON-180B-base --existing_test_models --min_gpus 8 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1 \ No newline at end of file +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --streaming +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 2 --tp_size 1 --pp_size 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 4 --tp_size 2 --pp_size 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 8 --tp_size 1 --pp_size 8 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --ptuning --min_gpus 1 
--max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --lora --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-code --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base-fp8 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base-int4 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base-int8 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base-fp8 --existing_test_models --min_gpus 2 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_test_models --min_gpus 2 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_gpus 2 --max_gpus 8 +python tests/export/nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_gpus 8 --max_gpus 8 +python tests/export/nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_gpus 8 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-RLHF --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SteerLM --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name GPT-43B-Base --existing_test_models --min_gpus 2 --max_gpus 8 +python tests/export/nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_gpus 2 --max_gpus 8 +python tests/export/nemo_export.py --model_name FALCON-180B-base --existing_test_models --min_gpus 8 --max_gpus 8 +python tests/export/nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1 \ No newline at end of file From a01fa6d5f569d18ddf79bcb8cbe64193ac52b634 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Wed, 12 Jun 2024 22:22:54 -0400 Subject: [PATCH 17/25] In-framework deployment (#9438) * initial MegatronGPTDeployable class * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * delete old comment * first draft of MegatronGPTDeployable test script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * small cleanup of test_triton_deployable.py * move MegatronGPTDeployable into nlp folder since it is language specific * update test_triton_deployable for new MegatronGPTDeployable location * renaming NemoQueryLLM classes * MegatronGPTDeployable should programatically 
generate input/output fields from the relevant internal classes instead of hard-coding whenever possible * add NemoTritonQueryLLMPyTorch class and example * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * MegatronGPTModel should always load on creation, also allow number of gpus to be controlled via argument * got logprobs working, but can only process one prompt at a time * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add nemo deployable to deploy_triton.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * multigpu working, with manual torch.distributed calls * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rename MegatronGPTDeployable to MegatronLLMDeployable * MegatronGPTDeployable->MegatronLLMDeployable rename for filenames * move torch.distributed calls inside MegatronLLMDeployable * add constructor for existing model class, tested working with Mistral7B and Nemotron3-22B * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rename test_triton_deployable.py to tests_pytriton_deploy.py * cleanup, comments, and style guide fixes * add warning for multigpu cases where users will need to be aware of pytorch lightning DDP behavior * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixing formatting of logprob outputs * fix single gpu behavior, and add padding to outputs to allow for multi-prompt logprob calculation * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * fixing codeQL issues * Apply isort and black reformatting Signed-off-by: jukim-nv * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * removed min_length definition in previous commit but forgot to remove its use * update comments and arguments in deploy/nlp/query_llm.py * Apply isort and black reformatting Signed-off-by: jukim-nv * delete unused arguments from test_pytriton_deploy.py * remove some debug prints from megatronllm_deployable * rename test file due to pytest issue Signed-off-by: Onur Yilmaz --------- Signed-off-by: oyilmaz-nvidia Signed-off-by: jukim-nv Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Signed-off-by: Onur Yilmaz Co-authored-by: Justin Kim Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: oyilmaz-nvidia Co-authored-by: jukim-nv Co-authored-by: Pablo Garay --- nemo/deploy/nlp/__init__.py | 4 +- nemo/deploy/nlp/megatronllm_deployable.py | 316 ++++++++++++++++++++++ scripts/deploy/nlp/deploy_triton.py | 75 ++--- tests/deploy/pytriton_deploy.py | 136 ++++++++++ 4 files changed, 498 insertions(+), 33 deletions(-) create mode 100644 nemo/deploy/nlp/megatronllm_deployable.py create mode 100644 tests/deploy/pytriton_deploy.py diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py index 21e2ca2751f8..52d5b3dbff3e 100644 --- a/nemo/deploy/nlp/__init__.py +++ b/nemo/deploy/nlp/__init__.py @@ -15,6 +15,8 @@ use_query_llm = True try: - from nemo.deploy.nlp.query_llm import NemoQueryLLM + from nemo.deploy.nlp.query_llm import NemoTritonQueryLLMTensorRT except Exception: use_query_llm = False + +from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable diff --git a/nemo/deploy/nlp/megatronllm_deployable.py 
b/nemo/deploy/nlp/megatronllm_deployable.py new file mode 100644 index 000000000000..c27bbbd0102b --- /dev/null +++ b/nemo/deploy/nlp/megatronllm_deployable.py @@ -0,0 +1,316 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from enum import IntEnum, auto +from pathlib import Path + +import numpy as np +import torch +import wrapt +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.text_generation_utils import ( + OutputType, + get_default_length_params, + get_default_sampling_params, +) +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.deploy import ITritonDeployable +from nemo.deploy.utils import cast_output, str_ndarray2list + + +@wrapt.decorator +def noop_decorator(func): + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + return wrapper + + +use_pytriton = True +batch = noop_decorator +try: + from pytriton.decorators import batch + from pytriton.model_config import Tensor +except Exception: + use_pytriton = False + +LOGGER = logging.getLogger("NeMo") + + +def GetTensorShape(pyvalue): + """ + utility function to get Triton Tensor shape from a python value + assume that lists are shape -1 and all others are scalars with shape 1 + """ + return (-1 if type(pyvalue) == list else 1,) + + +def GetNumpyDtype(pyvalue): + """ + utility function to get numpy dtype of a python value + e.g. bool -> np.bool_ + """ + ''' + manually defining the mapping of python type -> numpy type for now + is there a better way to do it? 
tried np.array(pyvalue).dtype, but that doesn't seem to work + ''' + py_to_numpy_mapping = {str: bytes, bool: np.bool_, float: np.single, int: np.int_} + python_type = type(pyvalue) + # for lists, return the type of the internal elements + if python_type == list: + python_type = type(pyvalue[0]) + numpy_type = py_to_numpy_mapping[python_type] + return numpy_type + + +class ServerSync(IntEnum): + """Enum for synchronization messages using torch.distributed""" + + WAIT = auto() + SIGNAL = auto() + + def to_long_tensor(self): + return torch.tensor([self], dtype=torch.long, device='cuda') + + +class MegatronLLMDeployable(ITritonDeployable): + """Triton inference server compatible deploy class for a .nemo model file""" + + def __init__( + self, + nemo_checkpoint_filepath: str = None, + num_devices: int = 1, + num_nodes: int = 1, + existing_model: MegatronGPTModel = None, + ): + if nemo_checkpoint_filepath is None and existing_model is None: + raise ValueError( + "MegatronLLMDeployable requires either a .nemo checkpoint filepath or an existing MegatronGPTModel, but both provided were None" + ) + if num_devices > 1: + LOGGER.warning( + "Creating a MegatronLLMDeployable with num_devices>1 will assume running with a PyTorch Lightning DDP-variant strategy, which will run the main script once per device. Make sure any user code is compatible with multiple executions!" + ) + + # if both existing_model and nemo_checkpoint_filepath are provided, existing_model will take precedence + if existing_model is not None: + self.model = existing_model + else: + self._load_from_nemo_checkpoint(nemo_checkpoint_filepath, num_devices, num_nodes) + + self.model.eval() + # helper threads spawned by torch.multiprocessing should loop inside this helper function + self._helper_thread_evaluation_loop() + + def _load_from_nemo_checkpoint(self, nemo_checkpoint_filepath: str, num_devices: int, num_nodes: int): + if Path(nemo_checkpoint_filepath).exists(): + trainer = Trainer( + strategy=NLPDDPStrategy(), + devices=num_devices, + num_nodes=num_nodes, + ) + + custom_config = MegatronGPTModel.restore_from( + nemo_checkpoint_filepath, trainer=trainer, return_config=True + ) + # transformer_engine should always be true according to EricH, but GPT-2B model will fail if it is enabled + custom_config.transformer_engine = True + # using multi-gpu for tensor parallelism directly for now, could do pipeline parallel instead or a combination + custom_config.tensor_model_parallel_size = num_devices + # had to override these to make Nemotron3-22B work, see sample_sequence_batch() in text_generation_utils.py + custom_config.activations_checkpoint_granularity = None + custom_config.activations_checkpoint_method = None + + self.model = MegatronGPTModel.restore_from( + nemo_checkpoint_filepath, trainer=trainer, override_config_path=custom_config + ) + + def _helper_thread_evaluation_loop(self): + # only deploy the server on main thread, other threads enter this evaluation loop + if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0: + while True: + wait_value = ServerSync.WAIT.to_long_tensor() + torch.distributed.broadcast(wait_value, 0) + if wait_value.item() == ServerSync.SIGNAL: + self.model.generate(inputs=[""], length_params=None) + + _INPUT_PARAMETER_FIELDS = { + "prompts": (-1, bytes, False), + } + + ''' + there is no get_default equivalent for OutputType like there is for SamplingParameters and LengthParameters + but we still want to generate output using a real OutputType TypedDict for static type checking + ''' 
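    # For illustration, the two helpers above map example Python values as follows:
    #   GetTensorShape(["some prompt"]) -> (-1,)   lists become a variable-length dimension
    #   GetTensorShape(0.0)             -> (1,)    scalars become shape (1,)
    #   GetNumpyDtype("text")           -> bytes
    #   GetNumpyDtype(True)             -> np.bool_
    #   GetNumpyDtype(0.0)              -> np.single
    #   GetNumpyDtype(7)                -> np.int_
    # get_triton_input and get_triton_output below rely on this mapping to turn each default
    # sampling/length parameter and each OutputType field into a pytriton Tensor spec.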
+ _BLANK_OUTPUTTYPE: OutputType = { + 'sentences': [""], + 'tokens': [[""]], + 'logprob': [[0.0]], + 'full_logprob': [[0.0]], + 'token_ids': [[0]], + 'offsets': [[0]], + } + + @property + def get_triton_input(self): + input_parameters = tuple( + Tensor(name=name, shape=(shape,), dtype=dtype, optional=optional) + for name, (shape, dtype, optional) in self._INPUT_PARAMETER_FIELDS.items() + ) + ''' + in theory, would like to use typedict2tensor() function to generate Tensors, but it purposely ignores 1D arrays + asked JakubK why on 2024-04-26, but he doesn't know who owns the code + sampling_parameters = typedict2tensor(SamplingParam) + length_parameters = typedict2tensor(LengthParam) + ''' + default_sampling_params: SamplingParam = get_default_sampling_params() + sampling_parameters = tuple( + Tensor( + name=parameter_name, + shape=GetTensorShape(parameter_value), + dtype=GetNumpyDtype(parameter_value), + optional=True, + ) + for parameter_name, parameter_value in default_sampling_params.items() + ) + default_length_params: LengthParam = get_default_length_params() + length_parameters = tuple( + Tensor( + name=parameter_name, + shape=GetTensorShape(parameter_value), + dtype=GetNumpyDtype(parameter_value), + optional=True, + ) + for parameter_name, parameter_value in default_length_params.items() + ) + + inputs = input_parameters + sampling_parameters + length_parameters + return inputs + + @property + def get_triton_output(self): + # outputs are defined by the fields of OutputType + outputs = [ + Tensor( + name=parameter_name, + shape=GetTensorShape(parameter_value), + dtype=GetNumpyDtype(parameter_value[0]), + ) + for parameter_name, parameter_value in MegatronLLMDeployable._BLANK_OUTPUTTYPE.items() + ] + return outputs + + @staticmethod + def _sampling_params_from_triton_inputs(**inputs: np.ndarray): + """Extract SamplingParam fields from triton input dict""" + sampling_params: SamplingParam = get_default_sampling_params() + for sampling_param_field in sampling_params.keys(): + if sampling_param_field in inputs: + sampling_params[sampling_param_field] = inputs.pop(sampling_param_field)[0][0] + return sampling_params + + @staticmethod + def _length_params_from_triton_inputs(**inputs: np.ndarray): + """Extract LengthParam fields from triton input dict""" + length_params: LengthParam = get_default_length_params() + for length_param_field in length_params.keys(): + if length_param_field in inputs: + length_params[length_param_field] = inputs.pop(length_param_field)[0][0] + return length_params + + @batch + def triton_infer_fn(self, **inputs: np.ndarray): + """Triton server inference function that actually runs the model""" + if torch.distributed.is_initialized(): + distributed_rank = torch.distributed.get_rank() + if distributed_rank != 0: + raise ValueError( + f"Triton inference function should not be called on a thread with torch.distributed rank != 0, but this thread is rank {distributed_rank}" + ) + signal_value = ServerSync.SIGNAL.to_long_tensor() + torch.distributed.broadcast(signal_value, 0) + + input_strings = str_ndarray2list(inputs.pop("prompts")) + sampling_params = self._sampling_params_from_triton_inputs(**inputs) + length_params = self._length_params_from_triton_inputs(**inputs) + + model_output = self.model.generate( + inputs=input_strings, length_params=length_params, sampling_params=sampling_params + ) + ''' + model_output['sentences'] will be a list of strings (one per prompt) + other fields will either be a list of lists (tokens, for example) + or a list of pytorch Tensor + 
''' + + triton_output = {} + _OUTPUT_FILLER_VALUES = { + 'tokens': "", + 'logprob': 0.0, + 'full_logprob': 0.0, + 'token_ids': -1, + 'offsets': -1, + } + for model_output_field, value in model_output.items(): + + if model_output_field != 'sentences' and value is not None: + # find length of longest non-sentence output item + field_longest_output_item = 0 + for item in value: + field_longest_output_item = max(field_longest_output_item, len(item)) + # then pad shorter items to match this length + for index, item in enumerate(value): + num_pad_values = field_longest_output_item - len(item) + if num_pad_values > 0: + pad_value = _OUTPUT_FILLER_VALUES[model_output_field] + if isinstance(item, torch.Tensor): + pad_tensor = torch.full( + (num_pad_values, item.size(1)) if item.dim() > 1 else (num_pad_values,), + pad_value, + dtype=item.dtype, + device='cuda', + ) + padded_item = torch.cat((item, pad_tensor)) + value[index] = padded_item + else: + pad_list = [pad_value] * num_pad_values + padded_item = item + pad_list + value[index] = padded_item + + field_dtype = GetNumpyDtype(MegatronLLMDeployable._BLANK_OUTPUTTYPE[model_output_field][0]) + if value is None: + # triton does not allow for optional output parameters, so need to populate them if they don't exist + triton_output[model_output_field] = np.full( + # 'sentences' should always have a valid value, so use that for the output shape + np.shape(model_output['sentences']), + MegatronLLMDeployable._BLANK_OUTPUTTYPE[model_output_field][0], + dtype=field_dtype, + ) + elif field_dtype == bytes: + # strings are cast to bytes + triton_output[model_output_field] = cast_output(value, field_dtype) + elif isinstance(value[0], torch.Tensor): + if value[0].dtype == torch.bfloat16: + # numpy currently does not support bfloat16, so need to manually convert it + triton_output[model_output_field] = np.array([tensor.cpu().float().numpy() for tensor in value]) + else: + triton_output[model_output_field] = np.array([tensor.cpu().numpy() for tensor in value]) + else: + # non-strings are output as-is (in numpy format) + triton_output[model_output_field] = np.array(value) + return triton_output diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 0f7866e57cda..835ff46dd5fe 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -19,9 +19,9 @@ from pathlib import Path from nemo.deploy import DeployPyTriton +from nemo.deploy.nlp import MegatronLLMDeployable from nemo.export import TensorRTLLM - LOGGER = logging.getLogger("NeMo") @@ -31,6 +31,13 @@ def get_args(argv): description=f"Deploy nemo models to Triton", ) parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") + parser.add_argument( + "-dsn", + "--direct_serve_nemo", + default=False, + action='store_true', + help="Serve the nemo model directly instead of exporting to TRTLLM first. 
Will ignore other TRTLLM-specific arguments.", + ) parser.add_argument( "-ptnc", "--ptuning_nemo_checkpoint", @@ -146,18 +153,7 @@ def get_args(argv): return args -def nemo_deploy(argv): - args = get_args(argv) - - if args.debug_mode: - loglevel = logging.DEBUG - else: - loglevel = logging.INFO - - LOGGER.setLevel(loglevel) - LOGGER.info("Logging level set to {}".format(loglevel)) - LOGGER.info(args) - +def get_trtllm_deployable(args): if args.triton_model_repository is None: trt_llm_path = "/tmp/trt_llm_model_dir/" LOGGER.info( @@ -170,28 +166,24 @@ def nemo_deploy(argv): trt_llm_path = args.triton_model_repository if args.nemo_checkpoint is None and args.triton_model_repository is None: - LOGGER.error( + raise ValueError( "The provided model repository is not a valid TensorRT-LLM model " "directory. Please provide a --nemo_checkpoint." ) - return if args.nemo_checkpoint is None and not os.path.isdir(args.triton_model_repository): - LOGGER.error( + raise ValueError( "The provided model repository is not a valid TensorRT-LLM model " "directory. Please provide a --nemo_checkpoint." ) - return if args.nemo_checkpoint is not None and args.model_type is None: - LOGGER.error("Model type is required to be defined if a nemo checkpoint is provided.") - return + raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") ptuning_tables_files = [] if not args.ptuning_nemo_checkpoint is None: if args.max_prompt_embedding_table_size is None: - LOGGER.error("max_prompt_embedding_table_size parameter is needed for the prompt tuning table(s).") - return + raise ValueError("max_prompt_embedding_table_size parameter is needed for the prompt tuning table(s).") for pt_checkpoint in args.ptuning_nemo_checkpoint: ptuning_nemo_checkpoint_path = Path(pt_checkpoint) @@ -199,19 +191,16 @@ def nemo_deploy(argv): if ptuning_nemo_checkpoint_path.is_file(): ptuning_tables_files.append(pt_checkpoint) else: - LOGGER.error("Could not read the prompt tuning tables from {0}".format(pt_checkpoint)) - return + raise IsADirectoryError("Could not read the prompt tuning tables from {0}".format(pt_checkpoint)) else: - LOGGER.error("File or directory {0} does not exist.".format(pt_checkpoint)) - return + raise FileNotFoundError("File or directory {0} does not exist.".format(pt_checkpoint)) if args.task_ids is not None: if len(ptuning_tables_files) != len(args.task_ids): - LOGGER.error( + raise RuntimeError( "Number of task ids and prompt embedding tables have to match. " "There are {0} tables and {1} task ids.".format(len(ptuning_tables_files), len(args.task_ids)) ) - return trt_llm_exporter = TensorRTLLM( model_dir=trt_llm_path, @@ -245,8 +234,7 @@ def nemo_deploy(argv): save_nemo_model_config=True, ) except Exception as error: - LOGGER.error("An error has occurred during the model export. Error message: " + str(error)) - return + raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) try: for i, prompt_embeddings_checkpoint_path in enumerate(ptuning_tables_files): @@ -265,12 +253,35 @@ def nemo_deploy(argv): prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, ) except Exception as error: - LOGGER.error("An error has occurred during adding the prompt embedding table(s). Error message: " + str(error)) - return + raise RuntimeError( + "An error has occurred during adding the prompt embedding table(s). 
Error message: " + str(error) + ) + return trt_llm_exporter + + +def get_nemo_deployable(args): + if args.nemo_checkpoint is None: + raise ValueError("Direct serve requires a .nemo checkpoint") + return MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) + + +def nemo_deploy(argv): + args = get_args(argv) + + if args.debug_mode: + loglevel = logging.DEBUG + else: + loglevel = logging.INFO + + LOGGER.setLevel(loglevel) + LOGGER.info("Logging level set to {}".format(loglevel)) + LOGGER.info(args) + + triton_deployable = get_nemo_deployable(args) if args.direct_serve_nemo else get_trtllm_deployable(args) try: nm = DeployPyTriton( - model=trt_llm_exporter, + model=triton_deployable, triton_model_name=args.triton_model_name, triton_model_version=args.triton_model_version, max_batch_size=args.max_batch_size, diff --git a/tests/deploy/pytriton_deploy.py b/tests/deploy/pytriton_deploy.py new file mode 100644 index 000000000000..3b722d2d7fec --- /dev/null +++ b/tests/deploy/pytriton_deploy.py @@ -0,0 +1,136 @@ +import argparse + +import numpy as np +from pytriton.client import ModelClient + +from nemo.deploy.deploy_pytriton import DeployPyTriton +from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable +from nemo.deploy.nlp.query_llm import NemoTritonQueryLLMPyTorch + + +def test_triton_deployable(args): + megatron_deployable = MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) + + prompts = ["What is the biggest planet in the solar system?", "What is the fastest steam locomotive in history?"] + url = "localhost:8000" + model_name = args.model_name + init_timeout = 600.0 + + nm = DeployPyTriton( + model=megatron_deployable, + triton_model_name=model_name, + triton_model_version=1, + max_batch_size=8, + port=8000, + address="0.0.0.0", + streaming=False, + ) + nm.deploy() + nm.run() + + # run once with NemoTritonQueryLLMPyTorch + nemo_triton_query = NemoTritonQueryLLMPyTorch(url, model_name) + + result_dict = nemo_triton_query.query_llm( + prompts, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + max_length=args.max_output_token, + init_timeout=init_timeout, + ) + print("NemoTritonQueryLLMPyTriton result:") + print(result_dict) + + # run once with ModelClient, the results should be identical + str_ndarray = np.array(prompts)[..., np.newaxis] + prompts = np.char.encode(str_ndarray, "utf-8") + max_output_token = np.full(prompts.shape, args.max_output_token, dtype=np.int_) + top_k = np.full(prompts.shape, args.top_k, dtype=np.int_) + top_p = np.full(prompts.shape, args.top_p, dtype=np.single) + temperature = np.full(prompts.shape, args.temperature, dtype=np.single) + + with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: + result_dict = client.infer_batch( + prompts=prompts, + max_length=max_output_token, + top_k=top_k, + top_p=top_p, + temperature=temperature, + ) + print("ModelClient result:") + print(result_dict) + + # test logprobs generation + # right now we don't support batches where output data is inconsistent in size, so submitting each prompt individually + all_probs = np.full(prompts.shape, True, dtype=np.bool_) + compute_logprob = np.full(prompts.shape, True, dtype=np.bool_) + with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: + logprob_results = client.infer_batch( + prompts=prompts, + max_length=max_output_token, + top_k=top_k, + top_p=top_p, + temperature=temperature, + all_probs=all_probs, + compute_logprob=compute_logprob, + ) + print("Logprob results:") + print(logprob_results) + + 
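    # For reference, the deployment exercised by this test can also be started from the
    # command line via the flag added in this PR (a sketch: the checkpoint path and model
    # name are placeholders, the remaining flags are those referenced in
    # scripts/deploy/nlp/deploy_triton.py):
    #
    #   python scripts/deploy/nlp/deploy_triton.py \
    #       --direct_serve_nemo \
    #       --nemo_checkpoint /path/to/model.nemo \
    #       --num_gpus 1 \
    #       --triton_model_name megatron_llm
    #
    # With --direct_serve_nemo set, get_nemo_deployable() builds a MegatronLLMDeployable and
    # the TensorRT-LLM specific arguments are ignored.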
nm.stop() + + +def get_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Deploy nemo models to Triton and benchmark the models", + ) + + parser.add_argument( + "--model_name", + type=str, + required=True, + ) + parser.add_argument( + "--num_gpus", + type=int, + default=1, + ) + parser.add_argument( + "--nemo_checkpoint", + type=str, + required=True, + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=8, + ) + parser.add_argument( + "--max_output_token", + type=int, + default=128, + ) + parser.add_argument( + "--top_k", + type=int, + default=1, + ) + parser.add_argument( + "--top_p", + type=float, + default=0.0, + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + ) + + return parser.parse_args() + + +if __name__ == '__main__': + args = get_args() + test_triton_deployable(args) From e00ba0bbff06ac2bc9736288f031f7e33009609e Mon Sep 17 00:00:00 2001 From: ashors1 <71393111+ashors1@users.noreply.github.com> Date: Thu, 13 Jun 2024 01:38:00 -0700 Subject: [PATCH 18/25] [NeMo-UX] Add nsys callback (#9461) * add nsys callback * Apply isort and black reformatting Signed-off-by: ashors1 --------- Signed-off-by: ashors1 Co-authored-by: ashors1 Co-authored-by: Marc Romeyn --- nemo/lightning/pytorch/callbacks/nsys.py | 69 ++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 nemo/lightning/pytorch/callbacks/nsys.py diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py new file mode 100644 index 000000000000..f50fe0481e9d --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/nsys.py @@ -0,0 +1,69 @@ +from typing import Any, List, Optional + +import torch +from pytorch_lightning.callbacks.callback import Callback + +from nemo.utils import logging +from nemo.utils.get_rank import get_rank + + +class NsysCallback(Callback): + + def __init__( + self, + start_step: int, + end_step: int, + ranks: List[int] = [0], + gen_shape: bool = False, + ): + """ + Args: + start_step (int): Global batch to start profiling + end_step (int): Global batch to end profiling + ranks (List[int]): Global rank IDs to profile + gen_shape (bool): Generate model and kernel details including input shapes + """ + assert type(start_step) == int, f'Nsys start_step must be of type int. Found: {type(start_step)}' + self._nsys_profile_start_step = start_step + + assert type(end_step) == int, f'Nsys end_step must be of type int. Found: {type(start_step)}' + self._nsys_profile_end_step = end_step + + assert ( + self._nsys_profile_end_step >= self._nsys_profile_start_step + ), f'Nsys end_step must be greater than or equal to nsys start_step' + + self._nsys_profile_ranks = ranks + self._nsys_profile_gen_shape = gen_shape + + logging.info( + f'Nsys profiling setup with start_step: {self._nsys_profile_start_step},' + f'and end_step: {self._nsys_profile_end_step}' + ) + + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Optional[int]: + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-start + We use it here to enable nsys profiling. 
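        # A minimal usage sketch (the Trainer wiring and the nsys invocation below are
        # illustrative assumptions, not taken verbatim from this patch):
        #
        #   from pytorch_lightning import Trainer
        #   from nemo.lightning.pytorch.callbacks.nsys import NsysCallback
        #
        #   trainer = Trainer(devices=1, callbacks=[NsysCallback(start_step=10, end_step=12, ranks=[0])])
        #   trainer.fit(model, datamodule=data)  # profiling window spans global batches 10-12 on rank 0
        #
        # The cudaProfilerStart()/cudaProfilerStop() calls in these hooks only record data when the
        # job runs under Nsight Systems with the CUDA profiler API capture range enabled, e.g.:
        #
        #   nsys profile -t cuda,nvtx --capture-range=cudaProfilerApi --capture-range-end=stop \
        #       python <your training script>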
+ """ + + device = trainer.strategy.root_device + if device.type == 'cuda': + if batch_idx == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks: + logging.info("====== Start nsys profiling ======") + torch.cuda.cudart().cudaProfilerStart() + if self._nsys_profile_gen_shape: + torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__() + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int) -> None: + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-end + We use it here to enable nsys profiling. + """ + + device = trainer.strategy.root_device + if device.type == 'cuda': + print(f'batch idx: {batch_idx}') + if batch_idx == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: + logging.info("====== End nsys profiling ======") + torch.cuda.cudart().cudaProfilerStop() From 5fa95ce370dc02bae12845cad47409a1ac147ae4 Mon Sep 17 00:00:00 2001 From: "John St. John" Date: Thu, 13 Jun 2024 07:14:24 -0700 Subject: [PATCH 19/25] Fix the megatron cyclic sampler (#9458) --- nemo/lightning/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 88e2f3436699..adfc0aa14d29 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -103,7 +103,6 @@ def add_megatron_sampler( ) elif dataloader_type == 'cyclic': batch_sampler = MegatronPretrainingRandomSampler( - dataloader.dataset, total_samples=len(dataloader.dataset), consumed_samples=consumed_samples, micro_batch_size=micro_batch_size, @@ -259,8 +258,9 @@ def __iter__(self): assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 # data sharding and random sampling + data_parallel_size = self.micro_batch_times_data_parallel_size // self.micro_batch_size bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) * self.micro_batch_size - bucket_offset = current_epoch_samples // self.data_parallel_size + bucket_offset = current_epoch_samples // data_parallel_size start_idx = self.data_parallel_rank * bucket_size g = torch.Generator() From 0b128071b7f66218ebb3694ebe99b6b0ca77ff7d Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Thu, 13 Jun 2024 16:22:34 +0200 Subject: [PATCH 20/25] [NeMo UX] Introducing optimizer module (#9454) * Trying to add support for mcore * Introducing OptimizerModule & LRSchedulerModule * Apply isort and black reformatting Signed-off-by: marcromeyn * Remove some un-used code * Make design more robust * Trying to fix failing megatron_parallel tests * Introducing OptimizerModule & LRSchedulerModule * Removing un-used import * Apply isort and black reformatting Signed-off-by: marcromeyn * Adding lr-schedulers * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix bug with setting finalize_model_grads * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: marcromeyn Co-authored-by: marcromeyn --- nemo/collections/llm/api.py | 8 +- nemo/collections/llm/gpt/model/base.py | 28 +- nemo/lightning/__init__.py | 4 + nemo/lightning/megatron_parallel.py | 1 - nemo/lightning/optim.py | 66 ---- nemo/lightning/pytorch/opt/__init__.py | 32 ++ nemo/lightning/pytorch/opt/base.py | 179 ++++++++++ nemo/lightning/pytorch/opt/lr_scheduler.py | 390 +++++++++++++++++++++ nemo/lightning/pytorch/opt/megatron.py | 97 +++++ nemo/lightning/pytorch/strategies.py | 6 +- 10 files changed, 717 insertions(+), 94 deletions(-) delete mode 100644 
nemo/lightning/optim.py create mode 100644 nemo/lightning/pytorch/opt/__init__.py create mode 100644 nemo/lightning/pytorch/opt/base.py create mode 100644 nemo/lightning/pytorch/opt/lr_scheduler.py create mode 100644 nemo/lightning/pytorch/opt/megatron.py diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 824d84ffb461..fdcfbda047c8 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -4,7 +4,7 @@ import pytorch_lightning as pl from nemo.collections.llm.utils import task -from nemo.lightning import MegatronStrategy, Trainer, io, teardown +from nemo.lightning import MegatronStrategy, OptimizerModule, Trainer, io, teardown @task(namespace="llm") @@ -12,6 +12,7 @@ def train( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, + opt: Optional[OptimizerModule] = None, tokenizer: Optional[str] = None, source: Optional[str] = None, export: Optional[str] = None, @@ -23,6 +24,8 @@ def train( model (pl.LightningModule): The model to be trained. data (pl.LightningDataModule): The data module containing training data. trainer (Trainer): The trainer instance configured with a MegatronStrategy. + opt (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer + from the model will be used. tokenizer (Optional[str]): Tokenizer setting to be applied. Can be 'data' or 'model'. source (Optional[str]): Path to a checkpoint from which to continue training. export (Optional[str]): Filename to save the exported checkpoint after training. @@ -58,6 +61,9 @@ def train( if source: _add_ckpt_path(source, model, fit_kwargs) + if opt: + opt.connect(model) + trainer.fit(model, data, **fit_kwargs) print(f"Saving checkpoint to: {export_dir}") diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 9f5c23493d03..e577ddb63d26 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -1,18 +1,16 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union +from typing import TYPE_CHECKING, Dict, Literal, Optional import pytorch_lightning as L import torch import torch.distributed +from megatron.core.optimizer import OptimizerConfig from megatron.core.transformer.transformer_config import TransformerConfig -from pytorch_lightning.utilities.types import OptimizerLRScheduler -from torch import nn -from torch.optim import Optimizer from nemo.collections.llm import fn from nemo.lightning import get_vocab_size, io from nemo.lightning.megatron_parallel import MaskedTokenLossReduction -from nemo.lightning.optim import MegatronOptim, OptimizerConfig +from nemo.lightning.pytorch.opt import MegatronOptimizerModule, OptimizerModule if TYPE_CHECKING: from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel @@ -70,20 +68,18 @@ def __init__( self, config: GPTConfig, # TODO: Add transformer_layer_spec when we update mcore - optim: Optional[Union[MegatronOptim, Callable[[nn.Module], OptimizerLRScheduler]]] = None, + optim: Optional[OptimizerModule] = None, tokenizer: Optional["TokenizerSpec"] = None, ): super().__init__() self.config = config self.tokenizer = tokenizer - self.optim = optim or MegatronOptim(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True)) + self.optim = optim or MegatronOptimizerModule(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True)) + self.optim.connect(self) # This will bind the `configure_optimizers` method def 
configure_model(self) -> None: self.module = self.config.configure_model(self.tokenizer) - def configure_optimizers(self, megatron_parallel=None): - return self.optim(megatron_parallel or self) - def forward( self, input_ids: torch.Tensor, @@ -171,16 +167,6 @@ def gpt_forward_step(model, batch) -> torch.Tensor: return model(**forward_args) -def gpt_default_optimizer(module) -> Optimizer: - # from apex.optimizers import FusedAdam - - from megatron.core.optimizer import OptimizerConfig - - return OptimizerConfig(lr=1e-4) - - # return FusedAdam(module.parameters(), lr=1e-4) - - def get_batch_on_this_context_parallel_rank(batch): from megatron.core import parallel_state @@ -233,4 +219,4 @@ def get_packed_seq_params(batch): ) -__all__ = ["GPTModel", "GPTConfig", "gpt_data_step", "gpt_forward_step", "gpt_default_optimizer"] +__all__ = ["GPTModel", "GPTConfig", "gpt_data_step", "gpt_forward_step"] diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index e54f223f91cc..31559ad9a81a 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -10,6 +10,7 @@ pass from nemo.lightning.base import get_vocab_size, teardown +from nemo.lightning.pytorch.opt import LRSchedulerModule, MegatronOptimizerModule, OptimizerModule from nemo.lightning.pytorch.plugins import MegatronDataSampler, MegatronMixedPrecision from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler from nemo.lightning.pytorch.strategies import MegatronStrategy @@ -29,9 +30,12 @@ def _is_slurm_interactive_mode(): __all__ = [ + "LRSchedulerModule", "MegatronStrategy", "MegatronDataSampler", "MegatronMixedPrecision", + "MegatronOptimizerModule", + "OptimizerModule", "Trainer", "get_vocab_size", "teardown", diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 12a9da97c342..3172d242e681 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -3,7 +3,6 @@ import functools import inspect import queue -import types from collections import defaultdict from typing import ( Any, diff --git a/nemo/lightning/optim.py b/nemo/lightning/optim.py deleted file mode 100644 index d706680776bc..000000000000 --- a/nemo/lightning/optim.py +++ /dev/null @@ -1,66 +0,0 @@ -from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Optional - -from megatron.core.distributed import finalize_model_grads -from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer -from megatron.core.utils import get_model_config -from pytorch_lightning.utilities.types import OptimizerLRScheduler -from torch.optim import Optimizer - -if TYPE_CHECKING: - from nemo.lightning.megatron_parallel import MegatronParallel - - -@dataclass -class MegatronOptim: - config: OptimizerConfig - finalize_model_grads: Callable = finalize_model_grads - - def create_optimizer( - self, - megatron_parallel: "MegatronParallel", - no_weight_decay_cond: Optional[Callable] = None, - scale_lr_cond: Optional[Callable] = None, - lr_mult: float = 1.0, - ) -> Optimizer: - from nemo.core.optim import McoreDistributedOptimizer - - # TODO: Where should we put this? 
- get_model_config(megatron_parallel[0]).finalize_model_grads = finalize_model_grads - - mcore_opt = get_megatron_optimizer( - self.config, - list(megatron_parallel), - no_weight_decay_cond=no_weight_decay_cond, - scale_lr_cond=scale_lr_cond, - lr_mult=lr_mult, - ) - - return McoreDistributedOptimizer(mcore_opt) - - def configure_optimizer(self, megatron_parallel: "MegatronParallel") -> OptimizerLRScheduler: - from nemo.core.optim.lr_scheduler import CosineAnnealing - - opt = self.create_optimizer(megatron_parallel) - - # TODO: Make this configurable through the dataclass - lr_scheduler = CosineAnnealing(opt, max_steps=10, warmup_steps=750, constant_steps=80000, min_lr=int(6e-5)) - - return { - "optimizer": opt, - # REQUIRED: The scheduler instance - "scheduler": lr_scheduler, - # The unit of the scheduler's step size, could also be 'step'. - # 'epoch' updates the scheduler on epoch end whereas 'step' - # updates it after a optimizer update. - "interval": "epoch", - # How many epochs/steps should pass between calls to - # `scheduler.step()`. 1 corresponds to updating the learning - # rate after every epoch/step. - "frequency": 1, - # Metric to to monitor for schedulers like `ReduceLROnPlateau` - "monitor": "val_loss", - } - - def __call__(self, megatron_parallel: "MegatronParallel") -> OptimizerLRScheduler: - return self.configure_optimizer(megatron_parallel) diff --git a/nemo/lightning/pytorch/opt/__init__.py b/nemo/lightning/pytorch/opt/__init__.py new file mode 100644 index 000000000000..988f40f5ca30 --- /dev/null +++ b/nemo/lightning/pytorch/opt/__init__.py @@ -0,0 +1,32 @@ +from nemo.lightning.pytorch.opt.base import LRSchedulerModule, OptimizerModule +from nemo.lightning.pytorch.opt.lr_scheduler import ( + InverseSquareRootAnnealingScheduler, + NoamAnnealingScheduler, + NoamHoldAnnealingScheduler, + PolynomialDecayAnnealingScheduler, + PolynomialHoldDecayAnnealingScheduler, + SquareAnnealingScheduler, + SquareRootAnnealingScheduler, + T5InverseSquareRootAnnealingScheduler, + WarmupAnnealingScheduler, + WarmupHoldPolicyScheduler, + WarmupPolicyScheduler, +) +from nemo.lightning.pytorch.opt.megatron import MegatronOptimizerModule + +__all__ = [ + "OptimizerModule", + "LRSchedulerModule", + "MegatronOptimizerModule", + "WarmupPolicyScheduler", + "WarmupHoldPolicyScheduler", + "SquareAnnealingScheduler", + "SquareRootAnnealingScheduler", + "NoamAnnealingScheduler", + "NoamHoldAnnealingScheduler", + "WarmupAnnealingScheduler", + "InverseSquareRootAnnealingScheduler", + "T5InverseSquareRootAnnealingScheduler", + "PolynomialDecayAnnealingScheduler", + "PolynomialHoldDecayAnnealingScheduler", +] diff --git a/nemo/lightning/pytorch/opt/base.py b/nemo/lightning/pytorch/opt/base.py new file mode 100644 index 000000000000..3e51cf451671 --- /dev/null +++ b/nemo/lightning/pytorch/opt/base.py @@ -0,0 +1,179 @@ +import types +from abc import ABC, abstractmethod +from typing import List, Optional + +import pytorch_lightning as L +from pytorch_lightning.utilities.types import OptimizerLRScheduler +from torch.optim import Optimizer + +from nemo.lightning.megatron_parallel import CallbackMethods + + +class LRSchedulerModule(L.Callback, CallbackMethods, ABC): + """A module to standardize the learning rate scheduler setup and configuration. + + This class decouples the learning rate scheduler from the model, similar to how the LightningDataModule + decouples data handling. 
It also acts as a Callback to hook into the training loop, which can be useful + for adding custom all-reduces, logging, early stopping, etc. Next to that standard Lightning callback-event, + this also supports hooking into the Megatron forward-backward function at a granular level. + + Example:: + + class MyLRSchedulerModule(LRSchedulerModule): + def setup(self, model, optimizer): + # Custom setup logic + ... + + def scheduler(self, model, optimizers): + # Define and return the learning rate scheduler + ... + + Methods: + setup(model, optimizer): Sets up the learning rate scheduler. + scheduler(model, optimizers): Abstract method to define the learning rate scheduler. + __call__(model, optimizers): Calls the setup and scheduler methods. + """ + + def setup(self, model, optimizer) -> None: + """Sets up the learning rate scheduler. + + Args: + model: The model for which the scheduler is being set up. + optimizer: The optimizer for which the scheduler is being set up. + """ + ... + + @abstractmethod + def scheduler(self, model, optimizers) -> OptimizerLRScheduler: + """Abstract method to define the learning rate scheduler. + + Args: + model: The model for which the scheduler is being defined. + optimizers: The optimizers for which the scheduler is being defined. + + Returns: + OptimizerLRScheduler: The learning rate scheduler. + """ + raise NotImplementedError("The scheduler method should be implemented by subclasses.") + + def __call__(self, model, optimizers): + """Calls the setup and scheduler methods. + + Args: + model: The model for which the scheduler is being called. + optimizers: The optimizers for which the scheduler is being called. + + Returns: + OptimizerLRScheduler: The learning rate scheduler. + """ + + self.setup(model, optimizers) + + self._scheduler = self.scheduler(model, optimizers) + + if not isinstance(self._scheduler, (dict, tuple)): + return optimizers, self._scheduler + + return self._scheduler + + +class OptimizerModule(L.Callback, CallbackMethods, ABC): + """A module to standardize the optimizer setup and configuration. + + This class decouples the optimizer from the model, similar to how the LightningDataModule + decouples data handling. It also acts as a Callback to hook into the training loop, which can be useful + for adding custom all-reduces, logging, early stopping, etc. Next to that standard Lightning callback-event, + this also supports hooking into the Megatron forward-backward function at a granular level. + + Attributes: + lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module. + + Example:: + + class MyOptimizerModule(OptimizerModule): + def __init__(self, lr_scheduler=None): + super().__init__(lr_scheduler) + + def setup(self, model): + # Custom setup logic + ... + + def optimizers(self, model): + # Define and return the optimizers + ... + + Methods: + connect(model, trainer): Connects the optimizer module to the model and trainer. + setup(model): Sets up the optimizer. + optimizers(model): Abstract method to define the optimizers. + __call__(model, megatron_parallel): Calls the setup and optimizers methods. + """ + + def __init__(self, lr_scheduler: Optional[LRSchedulerModule]): + """Initializes the OptimizerModule. + + Args: + lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module. + """ + self.lr_scheduler = lr_scheduler + + def connect(self, model: L.LightningModule) -> None: + """Connects the optimizer module to the model and trainer. 
+ + Args: + model (L.LightningModule): The model to which the optimizer module is being connected. + """ + + def custom_configure_optimizers(lightning_module_self, megatron_parallel=None): + opt = self(lightning_module_self, megatron_parallel=megatron_parallel) + return opt + + model.configure_optimizers = types.MethodType(custom_configure_optimizers, model) + + def setup(self, model) -> None: + """Sets up the optimizer. + + Args: + model: The model for which the optimizer is being set up. + """ + ... + + @abstractmethod + def optimizers(self, model) -> List[Optimizer]: + """Abstract method to define the optimizers. + + Args: + model: The model for which the optimizers are being defined. + + Returns: + List[Optimizer]: The list of optimizers. + """ + raise NotImplementedError("The optimizers method should be implemented by subclasses.") + + def __call__(self, model: L.LightningModule, megatron_parallel=None) -> OptimizerLRScheduler: + """Calls the setup and optimizers methods. + + Args: + model (L.LightningModule): The model for which the optimizers are being called. + megatron_parallel: Optional parallel model. + + Returns: + OptimizerLRScheduler: The optimizers and optionally the learning rate scheduler. + """ + _model = model if megatron_parallel is None else megatron_parallel + callbacks = _model.trainer.callbacks + if self not in callbacks: + callbacks.append(self) + if self.lr_scheduler is not None and self.lr_scheduler not in callbacks: + callbacks.append(self.lr_scheduler) + + self.setup(_model) + self._optimizers = self.optimizers(_model) + + if self.lr_scheduler is not None: + self.lr_scheduler.setup(_model, self._optimizers) + with_scheduler = self.lr_scheduler(_model, self._optimizers) + + return with_scheduler + + return self._optimizers diff --git a/nemo/lightning/pytorch/opt/lr_scheduler.py b/nemo/lightning/pytorch/opt/lr_scheduler.py new file mode 100644 index 000000000000..1ce8dcf0d815 --- /dev/null +++ b/nemo/lightning/pytorch/opt/lr_scheduler.py @@ -0,0 +1,390 @@ +from typing import Optional + +from nemo.core.optim.lr_scheduler import ( + InverseSquareRootAnnealing, + NoamAnnealing, + NoamHoldAnnealing, + PolynomialDecayAnnealing, + PolynomialHoldDecayAnnealing, + SquareAnnealing, + SquareRootAnnealing, + T5InverseSquareRootAnnealing, + WarmupAnnealing, + WarmupHoldPolicy, + WarmupPolicy, +) +from nemo.lightning.pytorch.opt.base import LRSchedulerModule + + +class WarmupPolicyScheduler(LRSchedulerModule): + """Warmup Policy Learning Rate Scheduler.""" + + def __init__( + self, + warmup_steps: int = 750, + warmup_ratio: Optional[float] = None, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.warmup_steps = warmup_steps + self.warmup_ratio = warmup_ratio + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = WarmupPolicy( + optimizer, + warmup_steps=self.warmup_steps, + warmup_ratio=self.warmup_ratio, + max_steps=self.max_steps, + min_lr=self.min_lr, + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class WarmupHoldPolicyScheduler(LRSchedulerModule): + """Warmup Hold Policy Learning Rate Scheduler.""" + + def __init__( + self, + warmup_steps: int = 750, + warmup_ratio: Optional[float] = None, + hold_steps: Optional[int] = 
None, + hold_ratio: Optional[float] = None, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.warmup_steps = warmup_steps + self.warmup_ratio = warmup_ratio + self.hold_steps = hold_steps + self.hold_ratio = hold_ratio + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = WarmupHoldPolicy( + optimizer, + warmup_steps=self.warmup_steps, + warmup_ratio=self.warmup_ratio, + hold_steps=self.hold_steps, + hold_ratio=self.hold_ratio, + max_steps=self.max_steps, + min_lr=self.min_lr, + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class SquareAnnealingScheduler(LRSchedulerModule): + """Square Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 1e-5, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = SquareAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class SquareRootAnnealingScheduler(LRSchedulerModule): + """Square Root Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = SquareRootAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class NoamAnnealingScheduler(LRSchedulerModule): + """Noam Annealing Learning Rate Scheduler.""" + + def __init__( + self, + d_model: int, + warmup_steps: int = 750, + warmup_ratio: Optional[float] = None, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.d_model = d_model + self.warmup_steps = warmup_steps + self.warmup_ratio = warmup_ratio + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = NoamAnnealing( + optimizer, + d_model=self.d_model, + warmup_steps=self.warmup_steps, + warmup_ratio=self.warmup_ratio, + max_steps=self.max_steps, + min_lr=self.min_lr, + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class NoamHoldAnnealingScheduler(LRSchedulerModule): + """Noam Hold Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + decay_rate: float = 0.5, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + 
): + super().__init__() + self.max_steps = max_steps + self.decay_rate = decay_rate + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = NoamHoldAnnealing( + optimizer, max_steps=self.max_steps, decay_rate=self.decay_rate, min_lr=self.min_lr + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class WarmupAnnealingScheduler(LRSchedulerModule): + """Warmup Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = WarmupAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class InverseSquareRootAnnealingScheduler(LRSchedulerModule): + """Inverse Square Root Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = InverseSquareRootAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class T5InverseSquareRootAnnealingScheduler(LRSchedulerModule): + """T5 Inverse Square Root Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = T5InverseSquareRootAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class PolynomialDecayAnnealingScheduler(LRSchedulerModule): + """Polynomial Decay Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + power: float = 1.0, + cycle: bool = False, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.power = power + self.cycle = cycle + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = PolynomialDecayAnnealing( + optimizer, max_steps=self.max_steps, min_lr=self.min_lr, power=self.power, cycle=self.cycle + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class 
PolynomialHoldDecayAnnealingScheduler(LRSchedulerModule): + """Polynomial Hold Decay Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + power: float = 1.0, + cycle: bool = False, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.power = power + self.cycle = cycle + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = PolynomialHoldDecayAnnealing( + optimizer, max_steps=self.max_steps, min_lr=self.min_lr, power=self.power, cycle=self.cycle + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } diff --git a/nemo/lightning/pytorch/opt/megatron.py b/nemo/lightning/pytorch/opt/megatron.py new file mode 100644 index 000000000000..dff08d7a07df --- /dev/null +++ b/nemo/lightning/pytorch/opt/megatron.py @@ -0,0 +1,97 @@ +from typing import Callable, List, Optional + +from megatron.core.distributed import finalize_model_grads +from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer +from megatron.core.utils import get_model_config +from torch.optim import Optimizer + +from nemo.lightning.megatron_parallel import MegatronParallel +from nemo.lightning.pytorch.opt.base import LRSchedulerModule, OptimizerModule + + +class MegatronOptimizerModule(OptimizerModule): + """A OptimizerModule for the megatron optimizers. + + Attributes: + config (OptimizerConfig): Configuration for the optimizer. + no_weight_decay_cond (Optional[Callable]): Condition for no weight decay. + scale_lr_cond (Optional[Callable]): Condition for scaling learning rate. + lr_mult (float): Learning rate multiplier. + + Example:: + + config = OptimizerConfig(...) + lr_scheduler = MyLRSchedulerModule(...) + optimizer_module = MegatronOptimizerModule(config, lr_scheduler) + + Methods: + setup(model): Sets up the optimizer. + optimizers(model): Defines the optimizers. + """ + + def __init__( + self, + config: OptimizerConfig, + lr_scheduler: Optional[LRSchedulerModule] = None, + no_weight_decay_cond: Optional[Callable] = None, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, + ): + """Initializes the MegatronOptimizerModule. + + Args: + config (OptimizerConfig): Configuration for the optimizer. + lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module. + no_weight_decay_cond (Optional[Callable]): Condition for no weight decay. + scale_lr_cond (Optional[Callable]): Condition for scaling learning rate. + lr_mult (float): Learning rate multiplier. + """ + + super().__init__(lr_scheduler=lr_scheduler) + self.config = config + self.no_weight_decay_cond = no_weight_decay_cond + self.scale_lr_cond = scale_lr_cond + self.lr_mult = lr_mult + + def setup(self, model): + """We will add the finalize_model_grads function to the model config. + + Args: + model: The model for which the optimizer is being set up. + """ + + def finalize_model_grads_func(*args, **kwargs): + return self.finalize_model_grads(*args, **kwargs) + + get_model_config(model[0]).finalize_model_grads_func = finalize_model_grads_func + + def optimizers(self, model: MegatronParallel) -> List[Optimizer]: + """Defines the optimizers. + + Args: + model (MegatronParallel): The model for which the optimizers are being defined. 
+ + Returns: + List[Optimizer]: The list of optimizers. + + Raises: + ValueError: If the model is not an instance of MegatronParallel. + """ + + if not isinstance(model, MegatronParallel): + raise ValueError("Model must be an instance of MegatronParallel") + + from nemo.core.optim import McoreDistributedOptimizer + + mcore_opt = get_megatron_optimizer( + self.config, + list(model), + no_weight_decay_cond=self.no_weight_decay_cond, + scale_lr_cond=self.scale_lr_cond, + lr_mult=self.lr_mult, + ) + + return [McoreDistributedOptimizer(mcore_opt)] + + def finalize_model_grads(self, *args, **kwargs): + return finalize_model_grads(*args, **kwargs) diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 7daef032376b..7aceda64de43 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -212,6 +212,7 @@ def setup_megatron_parallel(self, trainer: pl.Trainer) -> None: cpu=isinstance(trainer.accelerator, CPUAccelerator), ddp_config=self.ddp_config, ) + self.megatron_parallel.trainer = trainer # check signature-def of self.model.configure_optimizers to check if there's an optional arg: megatron_parallel sig = inspect.signature(self.model.configure_optimizers) @@ -232,16 +233,11 @@ def setup_megatron_parallel(self, trainer: pl.Trainer) -> None: _optimizers_to_device(self.optimizers, self.root_device) self.model = self.megatron_parallel - self.model.trainer = trainer if hasattr(self.precision_plugin, "convert_module"): self.model = self.precision_plugin.convert_module(self.model) self.model.callbacks.add(getattr(trainer, "callbacks")) - if hasattr(self, "optimizers") and self.optimizers: - for optimizer in self.optimizers: - self.model.callbacks.add(optimizer) - if self.data_sampler: self.model.callbacks.add(self.data_sampler) From 3c58ede560ff56744a8e86cf949e9395b4f3e52e Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Thu, 13 Jun 2024 12:34:40 -0400 Subject: [PATCH 21/25] fix minor import bug (#9463) Signed-off-by: Onur Yilmaz --- nemo/deploy/nlp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py index 52d5b3dbff3e..ae4db1ce6f2a 100644 --- a/nemo/deploy/nlp/__init__.py +++ b/nemo/deploy/nlp/__init__.py @@ -15,7 +15,7 @@ use_query_llm = True try: - from nemo.deploy.nlp.query_llm import NemoTritonQueryLLMTensorRT + from nemo.deploy.nlp.query_llm import NemoQueryLLM except Exception: use_query_llm = False From d52f67367b20a1ea58ec76f18e2b723a15f71fbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 13 Jun 2024 20:49:30 +0200 Subject: [PATCH 22/25] ci(notifications): Fetch all jobs (#9465) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index fab97d71f47a..abac79310fdf 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4310,7 +4310,8 @@ jobs: } ' - JOBS_URL="https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs" + # We are close to reaching 100 jobs: Once we break that barrier, we have to iterate pages + JOBS_URL="https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" SUMMARY="[]" while IFS= read -r JOB; do JOB_NAME="$(echo $JOB | jq '.key' | tr -d '"') / main" From 
a6a0aeec0da3fa345e608d333b03cebcdc136960 Mon Sep 17 00:00:00 2001 From: Guy Jacob Date: Thu, 13 Jun 2024 22:04:02 +0300 Subject: [PATCH 23/25] Hyena Operator (#9264) * Initial reference code commit, unchanged Signed-off-by: Guy Jacob * Hyena code changes for NeMO compatibility Signed-off-by: Guy Jacob * MCore spec override functionality + example config w. hyena Signed-off-by: Guy Jacob * Additional changes - now working on char-level TinyShakespeare * Add missing input LayerNorm to spec (in the default attention spec it's fused with the projection Linear layer, so not explicitly defined) * Shape conversion at start and end of Hyena forward Signed-off-by: Guy Jacob * Add fftconv cuda impl from safari Signed-off-by: Guy Jacob * Workaround for shape error in fftconv See: https://github.com/HazyResearch/safari/issues/26#issuecomment-1589018138 Signed-off-by: Guy Jacob * Explicitly convert kernel to FP32 (torch.fft doesn't support bf16) Signed-off-by: Guy Jacob * Working run configs Signed-off-by: Guy Jacob * Remove sharded_state_dict from HyenaOperator (made redundant by the default inmplementation in Megatron) Signed-off-by: Guy Jacob * Update configs Signed-off-by: Guy Jacob * Testing TE Linear classes in HyenaOperator Signed-off-by: Guy Jacob * Revert to FusedDense for in/out projections after merging with 24.01.01 Signed-off-by: Guy Jacob * Fix bug (use fused LNorm+Linear), bring back TE layers Signed-off-by: Guy Jacob * Configs rename + cleanup Signed-off-by: Guy Jacob * FlashFFTConv, Multi-head, some cleanup Signed-off-by: Guy Jacob * Bug fix - init FlashFFTConv with 2*seq_len Signed-off-by: Guy Jacob * ModuleSpec + replace nn.Conv1d with causal_conv1d Signed-off-by: Guy Jacob * Remove unneeded arguments Signed-off-by: Guy Jacob * More cleanup, remove fftconv ref functions Signed-off-by: Guy Jacob * Refactor HyenaFilter + more cleanup * Refactor in spirit of implementation in MAD-Lab repo: https://github.com/athms/mad-lab/blob/main/mad/model/layers/hyena.py Signed-off-by: Guy Jacob * Add missing attributions Signed-off-by: Guy Jacob * Remove fftconv sources Signed-off-by: Guy Jacob * Bug fixes Signed-off-by: Guy Jacob * Remove d_model from external API, take from TransformerConfig Signed-off-by: Guy Jacob * cleanup config Signed-off-by: Guy Jacob * Remove spec override logic (possibly push separately) Signed-off-by: Guy Jacob * Add tests Signed-off-by: Guy Jacob * Keep only megatron_gpt_config_hyena (w. 
153m parameters) Signed-off-by: Guy Jacob * Black + isort formatting changes Signed-off-by: Guy Jacob * Fixes following PR review * Clearer names + more documentation for config params * Clearer README * Check seq len < 8K with safari-fftconv * Avoid 0*bias op during forward Signed-off-by: Guy Jacob * Fix tests following param name changes Signed-off-by: Guy Jacob --------- Signed-off-by: Guy Jacob --- .../conf/megatron_gpt_config_hyena.yaml | 277 +++++++++++++ .../language_modeling/megatron_gpt_model.py | 5 +- .../nlp/modules/common/hyena/README.md | 26 ++ .../nlp/modules/common/hyena/__init__.py | 1 + .../modules/common/hyena/fftconv_wrapper.py | 129 ++++++ .../nlp/modules/common/hyena/hyena.py | 381 ++++++++++++++++++ .../nlp/modules/common/hyena/hyena_filter.py | 173 ++++++++ .../nlp/modules/common/hyena/hyena_spec.py | 47 +++ tests/collections/nlp/test_hyena_operator.py | 179 ++++++++ 9 files changed, 1217 insertions(+), 1 deletion(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_gpt_config_hyena.yaml create mode 100644 nemo/collections/nlp/modules/common/hyena/README.md create mode 100644 nemo/collections/nlp/modules/common/hyena/__init__.py create mode 100644 nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py create mode 100644 nemo/collections/nlp/modules/common/hyena/hyena.py create mode 100644 nemo/collections/nlp/modules/common/hyena/hyena_filter.py create mode 100644 nemo/collections/nlp/modules/common/hyena/hyena_spec.py create mode 100644 tests/collections/nlp/test_hyena_operator.py diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config_hyena.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config_hyena.yaml new file mode 100644 index 000000000000..30e0beb0d5e5 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config_hyena.yaml @@ -0,0 +1,277 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: + +name: megatron_gpt_hyena +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_gpt_hyena + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + # use GPTModel from megatron.core + mcore_gpt: True + + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 16 # limited by GPU memory + global_batch_size: 256 # will use more micro batches to reach global batch size + rampup_batch_size: null # Should be a list of 3 values: [, , ] + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + # model architecture + encoder_seq_length: 2048 + max_position_embeddings: ${.encoder_seq_length} + num_layers: 18 + hidden_size: 864 + ffn_hidden_size: 1728 + num_attention_heads: 1 + init_method_std: 0.023 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. + normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: True # Whether to use bias terms in all weight matrices. + activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. 
+ transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] + openai_gelu: False # Use OpenAI's GELU instead of the default GeLU + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. + position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + attention_type: 'multihead' # Attention type. Options ['multihead'] + share_embeddings_and_output_weights: True # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + + name: te_gpt_hyena # key for selecting the correct ModuleSpec + + hyena: + # HyenaOperator parameters + max_seq_length: ${model.encoder_seq_length} # Maximum input sequence length. + order: 2 # Depth of the Hyena recurrence + num_heads: 1 # Number of heads (this is separate from model.num_attention_heads) + dropout: 0.0 + short_filter_order: 3 # Length of the explicit input convolutional filter + activation: "identity" # type of act between kernel output and output projection + + # HyenaConv parameters + precision: ${trainer.precision} # Training precision (required for FlashFFTConv initialization) + bias: true # Whether to apply a bias term following long convolution + + # HyenaFilter parameters + emb_dim: 33 # dimension of the filter's internal positional encoding + learn_pos_emb_z: true # whether the positional embeddings are learned + mlp_width: 64 # Width of the MLP parametrizing the implicit filter + sine_freq: 14 # frequency of periodic activations + num_inner_mlps: 2 # number of inner linear layers inside filter MLP + normalized: False # whether to apply normalization after modulation + + # ExponentialModulation parameters + modulate: True # Whether to apply exponential decay modulation + learn_modulation: False # Whether decay rates are learned + fast_decay_pct: 0.3 + slow_decay_pct: 1.5 + target: 1e-2 + shift: 0.0 + + tokenizer: + library: 'megatron' + type: 'GPT2BPETokenizer' + model: null + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. 
+ + # Mixed precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + + + # Miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. 
The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Transformer Engine + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + ub_tp_comm_overlap: False + # Use userbuffer backend to overlap tensor-parallel communications with computes. + # This feature is only available with Transformer Engine and squence parallelism enabled and, currently, supports only GPT models. + ub_tp_comm_overlap_cfg: null + # A yaml file with userbuffer communicator configurations. This file should provide `method`, `dtype`, `num_sm`, `num_splits`, + # `cga_size`, `num_splits`, `set_sm_margin`, and `aggregate` for the communicators to use custom settings. + # If the configuration file is not provided a default setting is used for all communicators. + + ## Flash Attention + use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True + + data: + # Path to data must be specified by the user. 
+ + # Supports List, String and Dictionary + # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} + # Or see example below: + # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" + data_prefix: ??? + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: "99990,8,2" + seq_length: ${model.encoder_seq_length} + skip_warmup: True + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + shuffle_documents: True # Set to False to disable document shuffling. Sample index will still be shuffled + exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: distributed_fused_adam + overlap_grad_sync: True + overlap_param_sync: False + contiguous_grad_buffer: True + lr: 6e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 636 + constant_steps: 100000 + min_lr: 2e-5 + + gc_interval: 0 + # Interval of the host memory garbage collection. When it is zero, collection relies on the automatic garbage collector. + # If an integer value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`.
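The `name: te_gpt_hyena` key and the `hyena:` block in the config above feed into the spec-selection change shown in the next diff. Below is a minimal sketch of that wiring, assuming a working NeMo/Megatron-core environment; the `hyena_cfg` values are purely illustrative, not a recommended configuration, and the sketch is not part of the patch itself.

```python
# Hypothetical usage sketch; mirrors how get_specs() (next diff) resolves the
# "te_gpt_hyena" spec name. Not part of the patch.
from nemo.collections.nlp.modules.common.hyena.hyena_spec import get_gpt_layer_with_te_and_hyena_spec

hyena_cfg = {
    "max_seq_length": 2048,  # normally taken from model.encoder_seq_length
    "order": 2,              # depth of the Hyena recurrence
    "num_heads": 1,
    "emb_dim": 33,           # positional-encoding dimension of the implicit filter
}

# Takes the standard TE GPT layer spec and swaps its self-attention submodule
# for a HyenaOperator ModuleSpec parameterized by hyena_cfg.
layer_spec = get_gpt_layer_with_te_and_hyena_spec(hyena_cfg)
print(layer_spec.submodules.self_attention.module.__name__)  # -> HyenaOperator
```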
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8cb8d95150c9..eb7d7b694e2f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -44,6 +44,7 @@ from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import get_gpt_layer_modelopt_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel +from nemo.collections.nlp.modules.common.hyena.hyena_spec import get_gpt_layer_with_te_and_hyena_spec from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.utils import ( @@ -143,7 +144,7 @@ def mcore_supports_moe() -> bool: return False -def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True): +def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True, hyena_cfg: Dict = None): if num_experts is not None: assert mcore_supports_moe(), "Megatron-core >= v0.5.0 is required for MoE" @@ -155,6 +156,7 @@ def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True): "megatron_falcon_gpt": get_falcon_layer_spec(), "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(), "modelopt": get_gpt_layer_modelopt_spec(), + "te_gpt_hyena": get_gpt_layer_with_te_and_hyena_spec(hyena_cfg), } if spec_name not in name_spec_dict: raise ValueError(f"Spec name '{spec_name}' is not recognized.") @@ -417,6 +419,7 @@ def model_provider_func(self, pre_process, post_process): self.transformer_config.num_moe_experts, self.transformer_config.moe_grouped_gemm, self.transformer_engine, + self.cfg.get('hyena', None), ), vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), diff --git a/nemo/collections/nlp/modules/common/hyena/README.md b/nemo/collections/nlp/modules/common/hyena/README.md new file mode 100644 index 000000000000..a5e7b32cc590 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/README.md @@ -0,0 +1,26 @@ +## Required Dependencies for Hyena + +We depend on third-party libraries for the FFT convolution implementation. Each library supports different use cases: + +| Library | Supported Sequence Length | Single/Multi-Head Support | +|:----------------:|:-------------------------:|:-------------------------:| +| Safari `fftconv` | Up to 8192 | 1 or 8 heads | +| FlashFFTConv | Up to 4M | Single-head only | + +Note the overlapping support for the single-head case with sequence lengths up to 8192: there we default to Safari `fftconv`, since it is faster, and fall back to FlashFFTConv otherwise. The user may force a specific FFT convolution implementation by setting the configuration key `model.hyena.fftconv_type` to either `safari` or `flash`. + +### Installation + +#### Safari `fftconv` + +Install from the [Safari repository](https://github.com/HazyResearch/safari/tree/main/csrc/fftconv). Run the following in a terminal: + +```bash +git clone https://github.com/HazyResearch/safari.git +cd safari/csrc/fftconv +pip install . 
+``` + +#### FlashFFTConv + +Follow the [installation instructions](https://github.com/HazyResearch/flash-fft-conv?tab=readme-ov-file#installation) in the FlashFFTConv repository. diff --git a/nemo/collections/nlp/modules/common/hyena/__init__.py b/nemo/collections/nlp/modules/common/hyena/__init__.py new file mode 100644 index 000000000000..f976e8f9d9c6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/__init__.py @@ -0,0 +1 @@ +from nemo.collections.nlp.modules.common.hyena.hyena import HyenaOperator diff --git a/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py b/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py new file mode 100644 index 000000000000..ca9a44489697 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py @@ -0,0 +1,129 @@ +import math + +import torch +from einops import rearrange +from fftconv import fftconv_bwd, fftconv_fwd + +# Code taken from: +# https://github.com/HazyResearch/safari/blob/main/src/ops/fftconv.py + + +class FFTConvFunc(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + u, + k, + D, + dropout_mask=None, + gelu=True, + force_fp16_output=False, + output_hbl_layout=False, + v=None, + head_dim=1, + q=None, + fftfp16=False, + k_rev=None, + ): + seqlen = u.shape[-1] + fft_size = max(2 * 2 ** int(math.ceil(math.log2(seqlen))), 16) + k_f = torch.fft.rfft(k, n=fft_size) + if k_rev is not None: + k_f = k_f + torch.fft.rfft(k_rev, n=fft_size).conj() + if u.stride(-1) != 1: + u = u.contiguous() + k_f = k_f.contiguous() + D = D.contiguous() + if v is not None and v.stride(-1) != 1: + v = v.contiguous() + if q is not None and q.stride(-1) != 1: + q = q.contiguous() + if dropout_mask is not None: + dropout_mask = dropout_mask.contiguous() + ctx.save_for_backward(u, k_f, D, dropout_mask, v, q) + ctx.output_hbl_layout = output_hbl_layout + ctx.head_dim = head_dim + ctx.gelu = gelu + ctx.fftfp16 = fftfp16 + ctx.has_k_rev = k_rev is not None + out = fftconv_fwd( + u, + k_f, + D, + v, + head_dim, + q, + dropout_mask, + gelu, + False, + False, + fft_size, + force_fp16_output, + output_hbl_layout, + fftfp16, + ) + return out + + @staticmethod + def backward(ctx, dout): + if ctx.output_hbl_layout: + dout = rearrange(rearrange(dout, 'b h l -> h b l').contiguous(), 'h b l -> b h l') + else: + dout = dout.contiguous() + u, k_f, D, dropout_mask, v, q = ctx.saved_tensors + seqlen = u.shape[-1] + fft_size = max(2 * 2 ** int(math.ceil(math.log2(seqlen))), 16) + du, dk_f, dD, dv, dq = fftconv_bwd( + dout, + u, + k_f, + D, + v, + ctx.head_dim, + q, + dropout_mask, + ctx.gelu, + False, + False, + fft_size, + ctx.output_hbl_layout, + ctx.fftfp16, + ) + dk = torch.fft.irfft(dk_f, n=fft_size, norm='forward')[..., :seqlen] + dk_rev = None if not ctx.has_k_rev else torch.fft.irfft(dk_f.conj(), n=fft_size, norm='forward')[..., :seqlen] + if v is not None: + dv = dv.to(dtype=v.dtype) # We do atomicAdd in fp32 so might need to convert to fp16 + return ( + du, + dk, + dD, + None, + None, + None, + None, + dv, + None, + dq, + None, + dk_rev, + ) + + +def fftconv_func( + u, + k, + D, + dropout_mask=None, + gelu=True, + force_fp16_output=False, + output_hbl_layout=False, + v=None, + head_dim=1, + q=None, + fftfp16=False, + k_rev=None, +): + return FFTConvFunc.apply( + u, k, D, dropout_mask, gelu, force_fp16_output, output_hbl_layout, v, head_dim, q, fftfp16, k_rev + ) diff --git a/nemo/collections/nlp/modules/common/hyena/hyena.py b/nemo/collections/nlp/modules/common/hyena/hyena.py new file mode 100644 index 
000000000000..f087a3d7a244 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/hyena.py @@ -0,0 +1,381 @@ +# Implementation of Hyena operator +# +# Michael Poli and Stefano Massaroli and Eric Nguyen and Daniel Y Fu and Tri Dao and Stephen Baccus and +# Yoshua Bengio and Stefano Ermon and Christopher Re, +# Hyena Hierarchy: Towards Larger Convolutional Language Models +# 2023, https://arxiv.org/abs/2302.10866 +# +# Multi-head variant introduced in: +# +# Stefano Massaroli and Michael Poli and Daniel Y Fu and Hermann Kumbong and Rom Nishijima Parnichkun and +# David W. Romero and Aman Timalsina and Quinn McIntyre and Beidi Chen and Atri Rudra and Ce Zhang and +# Christopher Re and Stefano Ermon and Yoshua Bengio, +# Laughing Hyena Distillery: Extracting Compact Recurrences From Convolutions +# NeurIPS 2023, https://arxiv.org/abs/2310.18780 +# +# Code is heavily based on the reference implementations from: +# https://github.com/HazyResearch/safari/blob/flashfftconv/src/models/sequence/hyena.py +# https://github.com/athms/mad-lab/blob/main/mad/model/layers/hyena.py + +from dataclasses import dataclass +from typing import Union + +import torch +import torch.nn as nn +from einops import rearrange +from megatron.core.transformer.custom_layers.transformer_engine import ( + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + +from nemo.collections.common.parts.utils import activation_registry +from nemo.collections.nlp.modules.common.hyena.hyena_filter import HyenaFilter, HyenaFilterSubmodules +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils.metaclasses import Singleton + +try: + from nemo.collections.nlp.modules.common.hyena.fftconv_wrapper import fftconv_func as safari_fftconv_fn + + HAVE_SAFARI_FFTCONV = True +except ImportError: + HAVE_SAFARI_FFTCONV = False + +try: + from flashfftconv import FlashFFTConv as FlashFFTConvImpl + + HAVE_FLASHFFTCONV = True + + class FlashFFTConv(metaclass=Singleton): + # Recommendation is to create single instance per model + # https://github.com/HazyResearch/flash-fft-conv?tab=readme-ov-file#example-model + def __init__(self, seqlen, dtype): + self.flashfftconv = FlashFFTConvImpl(seqlen, dtype) + +except ImportError: + HAVE_FLASHFFTCONV = False + +try: + from causal_conv1d import causal_conv1d_fn + + HAVE_CAUSAL_CONV1D = True +except ImportError: + HAVE_CAUSAL_CONV1D = False + + +@dataclass +class HyenaOperatorSubmodules: + in_proj: Union[ModuleSpec, type] = IdentityOp + short_filter: Union[ModuleSpec, type] = IdentityFuncOp + implicit_filter: Union[ModuleSpec, type] = IdentityOp + out_proj: Union[ModuleSpec, type] = IdentityOp + + +def auto_assign_attrs(cls, **kwargs): + for k, v in kwargs.items(): + setattr(cls, k, v) + + +class CausalDepthWiseConv1d(nn.Module): + def __init__(self, channels, width, bias=True): + if not HAVE_CAUSAL_CONV1D: + raise ImportError("Missing causal-conv1d library, please run 'pip install causal-conv1d'") + + super().__init__() + self.channels = channels + self.width = width + self._conv_1d = nn.Conv1d( + in_channels=channels, + out_channels=channels, + kernel_size=width, + padding=width - 1, + groups=channels, + bias=bias, + ) + + def forward(self, x): + return causal_conv1d_fn(x, self._conv_1d.weight.squeeze(1), self._conv_1d.bias) + + 
+class HyenaConv(nn.Module): + def __init__( + self, + d_model: int, + max_seq_length: int, + order: int, + bias: bool = True, + filter_cls: Union[ModuleSpec, type] = HyenaFilter, + filter_submodules: HyenaFilterSubmodules = None, + **filter_kwargs, + ): + super().__init__() + self.d_model = d_model + self.order = order + self.max_seq_length = max_seq_length + self.use_bias = bias + bias_shape = self.d_model * (self.order - 1) + if self.use_bias: + self.bias = nn.Parameter(torch.randn(bias_shape)) + else: + self.bias = torch.zeros(bias_shape) + + self.filter = build_module( + filter_cls, + self.d_model * (self.order - 1), + submodules=filter_submodules, + seq_len=max_seq_length, + **filter_kwargs, + ) + + +class SingleHeadHyenaConv(HyenaConv): + def __init__( + self, + d_model: int, + max_seq_length: int, + order: int, + bias: bool = True, + filter_cls: Union[ModuleSpec, type] = HyenaFilter, + filter_submodules: HyenaFilterSubmodules = None, + fftconv_type: str = None, + precision: str = 'bf16', + **filter_kwargs, + ): + super().__init__( + d_model, + max_seq_length, + order, + bias=bias, + filter_cls=filter_cls, + filter_submodules=filter_submodules, + **filter_kwargs, + ) + + if fftconv_type is None: + if max_seq_length <= 8192 and HAVE_SAFARI_FFTCONV: + # safari-fftconv supports seq-len <= 8192 and is a bit faster vs. flashfftconv + fftconv_type = 'safari' + else: + fftconv_type = 'flash' + + if fftconv_type not in ['safari', 'flash']: + raise ValueError("fftconv_type must be one of ['safari', 'flash']") + if fftconv_type == 'safari' and max_seq_length > 8192: + raise ValueError('Safari-fftconv only supports sequence length up to 8192') + if fftconv_type == 'safari' and not HAVE_SAFARI_FFTCONV: + raise ImportError('Safari-fftconv library not found. Please see README at for instructions.') + if fftconv_type == 'flash' and not HAVE_FLASHFFTCONV: + raise ImportError('flashfftconv library not found. Please see README at for instructions.') + + if fftconv_type == 'safari': + self.fftconv_fn = self._safari_fft + else: # fftconv_type == 'flash' + self.flashfftconv = FlashFFTConv( + 2 * self.max_seq_length, torch_dtype_from_precision(precision) + ).flashfftconv + self.fftconv_fn = self._flash_fft + + def _safari_fft(self, x, k, bias): + bias = bias.to(dtype=torch.float32) + return safari_fftconv_fn(x, k, bias, gelu=False) + + def _flash_fft(self, x, k, bias): + x = x.contiguous() + y = self.flashfftconv(x, k) + x * bias.unsqueeze(dim=1) + return y + + def forward(self, x, k, recurrence_idx): + bias = rearrange(self.bias, '(v o) -> o v', v=self.d_model, o=self.order - 1)[recurrence_idx] + y = self.fftconv_fn(x, k, bias) + return y + + +class MultiHeadHyenaConv(HyenaConv): + def __init__( + self, + d_model: int, + max_seq_length: int, + order: int, + num_heads: int, + bias: bool = True, + filter_cls: Union[ModuleSpec, type] = HyenaFilter, + filter_submodules: HyenaFilterSubmodules = None, + fftconv_type: str = None, + precision: str = 'bf16', + **filter_kwargs, + ): + if num_heads == 1: + raise ValueError('Expecting num_heads > 1') + if order != 2: + raise ValueError(f'Multi-head supported only with order == 2 (got order {self.order})') + if not HAVE_SAFARI_FFTCONV: + raise ImportError('Safari-fftconv library not found. 
Please see README at for instructions.') + + super().__init__( + d_model, + max_seq_length, + order, + bias=bias, + filter_cls=filter_cls, + filter_submodules=filter_submodules, + **filter_kwargs, + ) + self.num_heads = num_heads + + def forward(self, v, k, x1, x2): + bias = self.bias.to(dtype=torch.float32) + y = safari_fftconv_fn(v, k, bias, gelu=False, output_hbl_layout=True, v=x2, head_dim=self.num_heads, q=x1) + return y + + +class HyenaOperator(nn.Module): + def __init__( + self, + config: TransformerConfig, + max_seq_length: int, + order: int = 2, + num_heads: int = 1, + dropout: float = 0.0, + short_filter_order: int = 3, + activation: str = "identity", + submodules: HyenaOperatorSubmodules = None, + layer_number=None, + **long_conv_kwargs, + ): + r""" + Hyena operator described in the paper https://arxiv.org/pdf/2302.10866.pdf + + Args: + max_seq_length: (int): Maximum input sequence length. + order: (int): Depth of the Hyena recurrence. Defaults to 2 + num_heads: (int): Number of heads. Defaults to 1 + dropout: (float): Dropout probability. Defaults to 0.0 + short_filter_order: (int): Length of the explicit input convolutional filter. Defaults to 3 + activation: (str): type of act between kernel output and output projection (default identity) + """ + super().__init__() + + if submodules is None: + submodules = HyenaOperatorSubmodules( + in_proj=TELayerNormColumnParallelLinear, + short_filter=CausalDepthWiseConv1d, + implicit_filter=HyenaFilter, + out_proj=TERowParallelLinear, + ) + + if order < 2: + raise ValueError(f'Order must be at least 2, (got {self.order})') + + d_model = config.hidden_size + if d_model % num_heads != 0: + raise ValueError(f'Model dimension {d_model} must be divisible by num heads {num_heads}') + head_dim = d_model // num_heads + + auto_assign_attrs( + self, + d_model=d_model, + order=order, + max_seq_length=max_seq_length, + num_heads=num_heads, + head_dim=head_dim, + short_filter_order=short_filter_order, + activation=activation, + mcore_config=config, + ) + self.activation = activation_registry[activation]() + self.dropout = nn.Dropout(dropout) + + # Setup input and output projections (over the width dimension) + self.in_proj = build_module( + submodules.in_proj, + self.d_model, + (self.order + 1) * self.d_model, + config=self.mcore_config, + init_method=self.mcore_config.init_method, + gather_output=False, + bias=True, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='in_proj', + ) + + self.out_proj = build_module( + submodules.out_proj, + self.d_model, + self.d_model, + config=self.mcore_config, + init_method=self.mcore_config.output_layer_init_method, + bias=True, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='out_proj', + ) + + # Setup short filter + total_width = self.d_model * (self.order + 1) + self.short_filter = build_module(submodules.short_filter, total_width, self.short_filter_order) + + # Setup long convolution with implicit filter + long_conv_args = [self.head_dim, self.max_seq_length, self.order] + long_conv_kwargs['filter_cls'] = submodules.implicit_filter + long_conv_kwargs['filter_submodules'] = submodules.implicit_filter.submodules + if self.num_heads == 1: + self.long_conv = SingleHeadHyenaConv(*long_conv_args, **long_conv_kwargs) + self.conv_fwd_fn = self.conv_single_head + else: + long_conv_args.append(self.num_heads) + self.long_conv = MultiHeadHyenaConv(*long_conv_args, **long_conv_kwargs) + self.conv_fwd_fn = self.conv_multi_head + + def forward(self, u, *args, **kwargs): 
+ l = u.size(0) + l_filter = min(l, self.max_seq_length) + u = self.in_proj(u) + u = u[0] if isinstance(u, tuple) else u + u = rearrange(u, 'l b d -> b d l') # In MCore the leading dimension is the sequence dimension + + k = self.long_conv.filter(l_filter) + # `c` is always 1 by default + k = rearrange(k, 'c l v -> c v l', v=self.head_dim)[0] + + uc = self.short_filter(u)[..., :l_filter] + + k = k.to(dtype=torch.float32) + y = self.conv_fwd_fn(uc, k) + + y = rearrange(y, 'b d l -> b l d') + y = self.activation(y) + y = self.out_proj(y) + if isinstance(y, tuple): + y, bias = y + else: + bias = None + + # Convert back to sequence-first for MCore + y = rearrange(y, 'b l d -> l b d') + + # MCore TransformerLayer expects tuple where 2nd element represents the bias, it can be None + return y, bias + + def conv_single_head(self, uc, k): + k = rearrange(k, '(o v) l -> o v l', v=self.head_dim, o=self.order - 1) + + *x, v = uc.split(self.d_model, dim=1) + for o, x_i in enumerate(reversed(x[1:])): + v = self.dropout(v * x_i) + v = self.long_conv(v, k=k[o], recurrence_idx=o) + + y = v * x[0] + return y + + def conv_multi_head(self, uc, k): + x1, x2, v = uc.split(self.d_model, dim=1) + x1 = x1.contiguous() + x2 = x2.contiguous() + v = v.contiguous() + + y = self.long_conv(v, k, x1, x2) + return y diff --git a/nemo/collections/nlp/modules/common/hyena/hyena_filter.py b/nemo/collections/nlp/modules/common/hyena/hyena_filter.py new file mode 100644 index 000000000000..bf6752102480 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/hyena_filter.py @@ -0,0 +1,173 @@ +import math +from dataclasses import dataclass +from typing import Union + +import torch +import torch.nn as nn + +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec, build_module + +# Code mostly taken from: +# https://github.com/HazyResearch/safari/blob/flashfftconv/src/models/sequence/hyena.py + + +@dataclass +class HyenaFilterSubmodules: + positional_embedding: Union[ModuleSpec, type] = IdentityOp + linear: Union[ModuleSpec, type] = IdentityOp + activation: Union[ModuleSpec, type] = IdentityOp + modulation: Union[ModuleSpec, type] = IdentityOp + + +def register(module: nn.Module, name: str, tensor: torch.Tensor, learnable: bool): + if learnable: + module.register_parameter(name, nn.Parameter(tensor)) + else: + module.register_buffer(name, tensor) + + +class Sin(nn.Module): + def __init__(self, dim: int, freq: float = 10, train_freq: bool = True): + """ + Sinusoidal activation function with (optionally learned) per-channel frequency + """ + super().__init__() + self.freq = nn.Parameter(freq * torch.ones(1, dim)) if train_freq else freq * torch.ones(1, dim) + + def forward(self, x): + return torch.sin(self.freq * x) + + +class PositionalEmbedding(nn.Module): + def __init__( + self, + emb_dim: int, + seq_len: int, + learn_pos_emb_z: bool = True, + ): + """Complex exponential positional embeddings for Hyena filters.""" + super().__init__() + + self.seq_len = seq_len + # The time embedding fed to the filters is normalized so that t_f = 1 + t = torch.linspace(0, 1, self.seq_len)[None, :, None] # 1, L, 1 + + if emb_dim > 1: + bands = (emb_dim - 1) // 2 + # To compute the right embeddings we use the "proper" linspace + t_rescaled = torch.linspace(0, seq_len - 1, seq_len)[None, :, None] + w = 2 * math.pi * t_rescaled / seq_len # 1, L, 1 + + f = torch.linspace(1e-4, bands - 1, bands)[None, None] + z = torch.exp(-1j * f * w) + z = torch.cat([t, z.real, z.imag], 
dim=-1) + register(self, "z", z, learnable=learn_pos_emb_z) + register(self, "t", t, learnable=False) + + def forward(self, L): + return self.z[:, :L], self.t[:, :L] + + +class ExponentialModulation(nn.Module): + def __init__( + self, + d_model: int, + modulate: bool = True, + learn_modulation: bool = False, + fast_decay_pct: float = 0.3, + slow_decay_pct: float = 1.5, + target: float = 1e-2, + shift: float = 0.0, + ): + """ + Exponential decay modulation with (optionally learned) per-channel decay rate + """ + super().__init__() + self.modulate = modulate + self.shift = shift + max_decay = math.log(target) / fast_decay_pct + min_decay = math.log(target) / slow_decay_pct + deltas = torch.linspace(min_decay, max_decay, d_model)[None, None] + register(self, "deltas", deltas, learnable=learn_modulation) + + def forward(self, t, x): + if self.modulate: + decay = torch.exp(-t * self.deltas.abs()) + x = x * (decay + self.shift) + return x + + +class HyenaFilter(nn.Module): + def __init__( + self, + d_model: int, + seq_len: int = 1024, + emb_dim: int = 3, + learn_pos_emb_z: bool = True, + mlp_width: int = 64, + sine_freq: int = 1, + num_inner_mlps: int = 2, + normalized: bool = False, + submodules: HyenaFilterSubmodules = None, + **modulation_kwargs, + ): + """ + Implicit long filter with modulation. + + Args: + d_model (int): number of channels in the input + emb_dim (int): dimension of the positional encoding (`emb_dim` - 1) // 2 is the number of bands + mlp_width (int): Width of the MLP parametrizing the implicit filter. Defaults to 64 + seq_len (int): length of input sequence + learn_pos_emb_z (bool): whether the positional embeddings are learned + sine_freq (int): frequency of periodic activations + num_inner_mlps (int): number of inner linear layers inside filter MLP + normalized (bool): whether to apply normalization after modulation + """ + super().__init__() + + if submodules is None: + submodules = HyenaFilterSubmodules( + positional_embedding=PositionalEmbedding, + linear=nn.Linear, + activation=Sin, + modulation=ExponentialModulation, + ) + + self.d_model = d_model + self.mlp_width = mlp_width + + act = build_module(submodules.activation, dim=mlp_width, freq=sine_freq) + self.emb_dim = emb_dim + if emb_dim % 2 == 0 or emb_dim < 3: + raise ValueError("emb_dim must be odd and greater or equal to 3 (time, sine and cosine)") + self.seq_len = seq_len + + self.pos_emb = build_module(submodules.positional_embedding, emb_dim, seq_len, learn_pos_emb_z) + + # uses a variable number of inner linear layers + self.mlp = nn.Sequential( + build_module(submodules.linear, emb_dim, mlp_width), + act, + ) + for i in range(num_inner_mlps): + self.mlp.append(build_module(submodules.linear, mlp_width, mlp_width)) + self.mlp.append(act) + # final linear layer + self.mlp.append(build_module(submodules.linear, mlp_width, d_model, bias=False)) + + self.modulation = build_module(submodules.modulation, d_model, **modulation_kwargs) + + self.normalized = normalized + + def forward(self, L): + z, t = self.pos_emb(L) + h = self.mlp(z) + + h = self.modulation(t, h) + + if self.normalized: + h = h / torch.norm(h, dim=-1, p=1, keepdim=True) + + return h diff --git a/nemo/collections/nlp/modules/common/hyena/hyena_spec.py b/nemo/collections/nlp/modules/common/hyena/hyena_spec.py new file mode 100644 index 000000000000..cd9fd66f4e75 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/hyena_spec.py @@ -0,0 +1,47 @@ +import torch.nn as nn +from megatron.core.models.gpt.gpt_layer_specs import 
get_gpt_layer_with_transformer_engine_spec +from megatron.core.transformer.custom_layers.transformer_engine import ( + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.spec_utils import ModuleSpec + +from nemo.collections.nlp.modules.common.hyena.hyena import ( + CausalDepthWiseConv1d, + HyenaOperator, + HyenaOperatorSubmodules, +) +from nemo.collections.nlp.modules.common.hyena.hyena_filter import ( + ExponentialModulation, + HyenaFilter, + HyenaFilterSubmodules, + PositionalEmbedding, + Sin, +) + + +def get_hyena_layer_with_transformer_engine_spec(hyena_cfg): + return ModuleSpec( + module=HyenaOperator, + params=hyena_cfg, + submodules=HyenaOperatorSubmodules( + in_proj=TELayerNormColumnParallelLinear, + short_filter=CausalDepthWiseConv1d, + implicit_filter=ModuleSpec( + module=HyenaFilter, + submodules=HyenaFilterSubmodules( + positional_embedding=PositionalEmbedding, + linear=nn.Linear, + activation=Sin, + modulation=ExponentialModulation, + ), + ), + out_proj=TERowParallelLinear, + ), + ) + + +def get_gpt_layer_with_te_and_hyena_spec(hyena_cfg): + spec = get_gpt_layer_with_transformer_engine_spec() + spec.submodules.self_attention = get_hyena_layer_with_transformer_engine_spec(hyena_cfg) + return spec diff --git a/tests/collections/nlp/test_hyena_operator.py b/tests/collections/nlp/test_hyena_operator.py new file mode 100644 index 000000000000..d6ebaa2f335d --- /dev/null +++ b/tests/collections/nlp/test_hyena_operator.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import torch.nn +from megatron.core.transformer.transformer_config import TransformerConfig + +from nemo.collections.nlp.modules.common.hyena.hyena import HyenaOperator, MultiHeadHyenaConv, SingleHeadHyenaConv +from nemo.collections.nlp.modules.common.hyena.hyena_spec import get_hyena_layer_with_transformer_engine_spec +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision + +try: + import fftconv + + HAVE_FFTCONV = True +except ImportError: + HAVE_FFTCONV = False + +try: + import flashfftconv + + HAVE_FLASHFFTCONV = True +except ImportError: + HAVE_FLASHFFTCONV = False + +try: + import causal_conv1d + + HAVE_CAUSAL_CONV1D = True +except ImportError: + HAVE_CAUSAL_CONV1D = False + + +@pytest.fixture() +def transformer_config(): + cfg = TransformerConfig(num_layers=2, hidden_size=864, num_attention_heads=1) + return cfg + + +@pytest.fixture() +def hyena_config(): + cfg = { + # HyenaOperator parameters + 'max_seq_length': 1024, + 'order': 2, + 'num_heads': 1, + 'dropout': 0.0, + 'short_filter_order': 3, + 'activation': "identity", + # HyenaConv parameters + 'precision': 'bf16', + 'bias': True, + 'fftconv_type': None, + # HyenaFilter parameters + 'emb_dim': 33, + 'learn_pos_emb_z': True, + 'mlp_width': 64, + 'sine_freq': 1, + 'num_inner_mlps': 2, + 'normalized': False, + # ExponentialModulation parameters + 'modulate': True, + 'learn_modulation': False, + 'fast_decay_pct': 0.3, + 'slow_decay_pct': 1.5, + 'target': 1e-2, + 'shift': 0.0, + } + return cfg + + +@pytest.fixture() +def submodules(hyena_config): + return get_hyena_layer_with_transformer_engine_spec(hyena_config).submodules + + +@pytest.mark.run_only_on('GPU') +@pytest.mark.skipif(not HAVE_CAUSAL_CONV1D, reason='causal-conv-1d not installed') +class TestHyenaOperator: + @pytest.mark.skipif(not HAVE_FFTCONV, reason='Safari fftconv not installed') + @pytest.mark.parametrize( + "optionals_enabled, num_heads, expected_num_weights", + [(False, 1, 3068256), (True, 1, 3102912), (True, 8, 3053016)], + ) + def test_parameters( + self, optionals_enabled, num_heads, expected_num_weights, transformer_config, hyena_config, submodules + ): + # Expected num weights calculation: + # + # Denote: inner_width = d_model * (order + 1) + # head_dim = d_model / num_heads + # + # in_proj (layer_norm) --> d_model * 2 + # in_proj (linear) --> d_model * inner_width + inner_width + # out_proj (linear) --> d_model * d_model + d_model + # short_filter (depthwise-separable 1d conv) --> inner_width * short_filter_order + inner_width + # long_conv bias --> head_dim + # filter: + # pos_emb.z --> max_seq_len * emb_dim + # sin activation freqs --> mlp_width + # mlp: + # input layer --> emb_dim * mlp_width + mlp_width + # inner layers --> num_inner_mlps * (mlp_width ^ 2 + mlp_width) + # output_layer (no bias) --> mlp_width * head_dim + # modulation: head_dim + + hyena_config['fftconv_type'] = 'safari' + + hyena_config['learn_pos_emb_z'] = optionals_enabled + hyena_config['learn_modulation'] = optionals_enabled + hyena_config['num_heads'] = num_heads + hyena_module = HyenaOperator(transformer_config, submodules=submodules, **hyena_config) + + assert hyena_module.d_model == transformer_config.hidden_size + assert isinstance(hyena_module.long_conv.filter.pos_emb.z, torch.nn.Parameter) == optionals_enabled + assert isinstance(hyena_module.long_conv.filter.modulation.deltas, torch.nn.Parameter) == optionals_enabled + + num_weights = sum([p.numel() for p in hyena_module.parameters()]) + assert num_weights == expected_num_weights + + 
@staticmethod + def check_gpu_forward(hyena_module, transformer_config, hyena_config): + dtype = torch_dtype_from_precision(hyena_config['precision']) + hyena_module = hyena_module.to(device='cuda', dtype=dtype) + + bs = 4 + seq_len = hyena_config['max_seq_length'] + d_model = transformer_config.hidden_size + + x = torch.randn(seq_len, bs, d_model) + x = x.to(device='cuda', dtype=dtype) + + y, _ = hyena_module(x) + assert y.shape[0] == seq_len + assert y.shape[1] == bs + assert y.shape[2] == d_model + + @pytest.mark.skipif(not HAVE_FFTCONV, reason='Safari fftconv not installed') + def test_single_head_safari(self, transformer_config, hyena_config, submodules): + hyena_config['fftconv_type'] = 'safari' + hyena_config['num_heads'] = 1 + hyena_module = HyenaOperator(transformer_config, submodules=submodules, **hyena_config) + + assert isinstance(hyena_module.long_conv, SingleHeadHyenaConv) + assert hyena_module.long_conv.fftconv_fn == hyena_module.long_conv._safari_fft + + self.check_gpu_forward(hyena_module, transformer_config, hyena_config) + + @pytest.mark.skipif(not HAVE_FLASHFFTCONV, reason='Safari fftconv not installed') + def test_single_head_flash(self, transformer_config, hyena_config, submodules): + hyena_config['fftconv_type'] = 'flash' + hyena_config['num_heads'] = 1 + hyena_module = HyenaOperator(transformer_config, submodules=submodules, **hyena_config) + + assert isinstance(hyena_module.long_conv, SingleHeadHyenaConv) + assert hyena_module.long_conv.fftconv_fn == hyena_module.long_conv._flash_fft + + self.check_gpu_forward(hyena_module, transformer_config, hyena_config) + + @pytest.mark.skipif(not HAVE_FFTCONV, reason='Safari fftconv not installed') + def test_multi_head(self, transformer_config, hyena_config, submodules): + hyena_config['fftconv_type'] = 'safari' + hyena_config['num_heads'] = 8 + hyena_module = HyenaOperator(transformer_config, submodules=submodules, **hyena_config) + + assert isinstance(hyena_module.long_conv, MultiHeadHyenaConv) + + self.check_gpu_forward(hyena_module, transformer_config, hyena_config) From f47209bd2220966159ae1c482332ede88ecb8072 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Thu, 13 Jun 2024 15:25:37 -0400 Subject: [PATCH 24/25] Update build_dataset.py (#9467) * Update build_dataset.py fix bug during eval Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update build_dataset.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update build_dataset.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update build_dataset.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: stevehuang52 --------- Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Signed-off-by: stevehuang52 Co-authored-by: stevehuang52 --- .../multimodal/speech_llm/data/build_dataset.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/nemo/collections/multimodal/speech_llm/data/build_dataset.py b/nemo/collections/multimodal/speech_llm/data/build_dataset.py index b042386cea3b..698a01836169 100644 --- a/nemo/collections/multimodal/speech_llm/data/build_dataset.py +++ b/nemo/collections/multimodal/speech_llm/data/build_dataset.py @@ -207,6 +207,11 @@ def build_speechllm_dataloader(dataset, data_cfg, consumed_samples=0, is_predict ) return dataloader + pad_to_global_batch = not 
data_cfg.drop_last + if is_eval: + # don't pad to global batch if in eval mode, unless explicitly set by user (e.g., eval with DDP) + pad_to_global_batch = (not data_cfg.drop_last) and data_cfg.get("pad_samples_to_global_batch_size", False) + batch_sampler = MegatronPretrainingBatchSampler( total_samples=len(dataset), consumed_samples=consumed_samples, @@ -215,7 +220,7 @@ def build_speechllm_dataloader(dataset, data_cfg, consumed_samples=0, is_predict data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state.get_data_parallel_world_size(), drop_last=data_cfg.drop_last, - pad_samples_to_global_batch_size=not data_cfg.drop_last, + pad_samples_to_global_batch_size=pad_to_global_batch, ) dataloader = torch.utils.data.DataLoader( From 67bc8461e17aaa88652acd1588589067f1882d07 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Thu, 13 Jun 2024 14:42:27 -0700 Subject: [PATCH 25/25] Fix logging message (#9469) Signed-off-by: smajumdar --- nemo/collections/asr/modules/audio_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/modules/audio_preprocessing.py b/nemo/collections/asr/modules/audio_preprocessing.py index 2dca468fab35..33143364ede1 100644 --- a/nemo/collections/asr/modules/audio_preprocessing.py +++ b/nemo/collections/asr/modules/audio_preprocessing.py @@ -100,7 +100,7 @@ def __init__(self, win_length, hop_length): @torch.no_grad() def forward(self, input_signal, length): if input_signal.dtype != torch.float32: - logging.warn( + logging.warning( f"AudioPreprocessor received an input signal of dtype {input_signal.dtype}, rather than torch.float32. In sweeps across multiple datasets, we have found that the preprocessor is not robust to low precision mathematics. As such, it runs in float32. Your input will be cast to float32, but this is not necessarily enough to recover full accuracy. For example, simply casting input_signal from torch.float32 to torch.bfloat16, then back to torch.float32 before running AudioPreprocessor causes drops in absolute WER of up to 0.1%. torch.bfloat16 simply does not have enough mantissa bits to represent enough values in the range [-1.0,+1.0] correctly.", mode=logging_mode.ONCE, )
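The warning text above rests on a numeric claim: bfloat16 cannot represent the range [-1.0, +1.0] finely enough for audio preprocessing. Below is a minimal sketch that illustrates the round-trip error behind that claim; it only assumes PyTorch and is not part of the patch.

```python
# Hypothetical illustration of the precision loss the warning describes:
# a float32 waveform cast to bfloat16 and back does not come back unchanged.
import torch

signal = torch.rand(16000, dtype=torch.float32) * 2.0 - 1.0   # synthetic waveform in [-1.0, 1.0)
roundtrip = signal.to(torch.bfloat16).to(torch.float32)       # the lossy down/up cast
max_err = (signal - roundtrip).abs().max().item()
print(f"max round-trip error: {max_err:.6f}")                 # non-zero, hence the forced float32 path
```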