Merge branch 'main' into rohitrango/controlnet_cibug
Victor49152 authored Jul 30, 2024
2 parents 1bd4a73 + c29d91a commit e145e01
Showing 177 changed files with 26,302 additions and 1,454 deletions.
2 changes: 2 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,2 @@
.github/ @pablo-garay @ko3n1g
Dockerfile.ci @pablo-garay @ko3n1g
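# (GitHub requests review from the listed owners for any change matching these paths)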
154 changes: 107 additions & 47 deletions .github/workflows/cicd-main.yml
@@ -256,26 +256,29 @@ jobs:
quantization.num_calib_size=8 \
inference.batch_size=2 \
export.inference_tensor_parallel=2 \
export.sample_output=False \
export.save_path=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
L2_PTQ_Llama2_INT8_SQ:
OPTIONAL_L2_PTQ_Llama2_INT8_SQ:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
TIMEOUT: 15
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
quantization.num_calib_size=8 \
inference.batch_size=2 \
export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
quantization.num_calib_size=8 \
inference.batch_size=2 \
export.sample_output=False \
export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
IS_OPTIONAL: true
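
  # Both PTQ jobs above delegate to the shared template workflow. For orientation,
  # a minimal sketch of how .github/workflows/_test_template.yml could declare the
  # inputs used at these call sites (the template itself is not part of this diff,
  # so the exact shape and defaults below are assumptions):
  #
  #   on:
  #     workflow_call:
  #       inputs:
  #         RUNNER:
  #           type: string
  #           required: true
  #         TIMEOUT:
  #           type: number
  #           default: 10
  #         SCRIPT:
  #           type: string
  #           required: true
  #         AFTER_SCRIPT:
  #           type: string
  #           default: ''
  #         IS_OPTIONAL:
  #           type: boolean
  #           default: false
  #
  # Presumably, IS_OPTIONAL lets the template treat a failure as non-blocking
  # for the calling pipeline.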

# TODO: investigate int4_awq stuck issues and restore the test
#L2_PTQ_Llama2_INT4_AWQ:
@@ -310,44 +313,42 @@ jobs:
#- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

L2_QAT_Llama2_INT4:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \
quantization.algorithm=int4 \
quantization.num_calib_size=8 \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_steps=4 \
trainer.val_check_interval=4 \
+trainer.limit_val_batches=2 \
exp_manager.explicit_log_dir=llama2_qat_results \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.global_batch_size=2 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl]
rm -rf llama2_qat_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# OPTIONAL_L2_QAT_Llama2_INT4:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \
# quantization.algorithm=int4 \
# quantization.num_calib_size=8 \
# trainer.devices=1 \
# trainer.num_nodes=1 \
# trainer.max_steps=4 \
# trainer.val_check_interval=4 \
# +trainer.limit_val_batches=2 \
# exp_manager.explicit_log_dir=llama2_qat_results \
# model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# model.tensor_model_parallel_size=1 \
# model.pipeline_model_parallel_size=1 \
# model.global_batch_size=2 \
# model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
# model.data.train_ds.concat_sampling_probabilities=[1.0] \
# model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl]

# rm -rf llama2_qat_results

# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
@@ -810,7 +811,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1
pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads
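      # --with_downloads is a project-defined pytest option (its registration in the
      # repo's conftest is assumed here); it opts this test in to fetching remote assets.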
# L2: Segmentation Tool
L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav:
@@ -3305,6 +3306,62 @@ jobs:
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/lora_tuning_tp2
L2_Megatron_GPT_PEFT_Lora_TP2SP1:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-gpus-2-h100
SCRIPT: |
rm -rf /home/TestData/nlp/lora_tuning_tp2_sp1
CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
trainer.max_steps=3 \
trainer.val_check_interval=3 \
++trainer.limit_val_batches=2 \
trainer.precision=bf16 \
exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2_sp1 \
+model.mcore_gpt=True \
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.sequence_parallel=True \
model.megatron_amp_O2=True \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
+model.fp8=True \
+model.fp8_params=True \
+model.fp8_hybrid=True \
+model.fp8_e4m3=False \
+model.fp8_interval=1 \
+model.fp8_margin=0 \
+model.fp8_amax_history_len=32 \
+model.fp8_amax_compute_algo=max \
+model.reduce_amax=False \
+model.ub_tp_comm_overlap=False \
+model.tp_comm_overlap_ag=False \
+model.tp_comm_overlap_rs=False \
+model.tp_comm_overlap_disable_qkv=True \
model.peft.peft_scheme='lora' \
model.peft.lora_tuning.adapter_dim=16 \
model.peft.lora_tuning.alpha=32 \
model.peft.lora_tuning.column_init_method="kaiming" \
+model.peft.lora_tuning.dropout_position='pre' \
model.peft.lora_tuning.target_modules=['attention'] \
model.peft.lora_tuning.adapter_dropout=0.1 \
+model.peft.lora_tuning.a2a_experimental=1 \
model.answer_only_loss=True \
model.micro_batch_size=1 \
model.global_batch_size=1 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.train_ds.num_workers=0 \
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/lora_tuning_tp2_sp1
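  # For reference, the "+model.fp8*" overrides above append new keys to the model
  # config; under Hydra's standard override-to-YAML mapping they correspond to:
  #   model:
  #     fp8: true                  # enable FP8 via Transformer Engine
  #     fp8_params: true
  #     fp8_hybrid: true           # E4M3 forward, E5M2 for gradients
  #     fp8_e4m3: false
  #     fp8_interval: 1
  #     fp8_margin: 0
  #     fp8_amax_history_len: 32   # amax window for computing scaling factors
  #     fp8_amax_compute_algo: max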
L2_Megatron_GPT_Eval:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -3859,7 +3916,7 @@ jobs:
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.pipeline_model_parallel_split_rank=1 \
model.pipeline_model_parallel_split_rank=0 \
model.seq_length=256 \
model.encoder.num_layers=4 \
model.decoder.num_layers=1 \
@@ -4560,6 +4617,8 @@ jobs:

Nemo_CICD_Test:
needs:
- gpu-test
- cicd-test-container-setup
- L0_Unit_Tests_GPU
- L0_Unit_Tests_CPU
- L2_Community_LLM_Checkpoints_tests_Llama
@@ -4630,6 +4689,7 @@ jobs:
- L2_Megatron_GPT_Embedding
- L2_Megatron_GPT_PEFT_Lora_PP2_O2
- L2_Megatron_GPT_PEFT_Lora_TP2_O1
- L2_Megatron_GPT_PEFT_Lora_TP2SP1
- L2_Megatron_GPT_Eval
- L2_Megatron_GPT_Eval_PP2
- L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len
20 changes: 10 additions & 10 deletions .github/workflows/config/changelog-config.json
@@ -1,47 +1,47 @@
{
"categories": [
{
"title": "## ASR \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "## ASR\n\n<details><summary>Changelog</summary>",
"labels": ["asr"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## TTS \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## TTS\n\n<details><summary>Changelog</summary>",
"labels": ["tts"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## NLP / NMT \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## NLP / NMT\n\n<details><summary>Changelog</summary>",
"labels": ["nlp", "nmt", "megatron"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## Text Normalization / Inverse Text Normalization \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## Text Normalization / Inverse Text Normalization\n\n<details><summary>Changelog</summary>",
"labels": ["tn", "itn"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## NeMo Tools \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## NeMo Tools\n\n<details><summary>Changelog</summary>",
"labels": ["tools"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## Export \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## Export\n\n<details><summary>Changelog</summary>",
"labels": ["export"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## Documentation \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## Documentation\n\n<details><summary>Changelog</summary>",
"labels": ["docs"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## Bugfixes \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## Bugfixes\n\n<details><summary>Changelog</summary>",
"labels": ["bug"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## Cherrypick \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## Cherrypick\n\n<details><summary>Changelog</summary>",
"labels": ["cherry-pick"],
"exclude_labels": ["cherry-pick"]
}
@@ -50,7 +50,7 @@
"ignore"
],
"sort": "ASC",
"template": "\n${{CHANGELOG}}\nUncategorized:\n${{UNCATEGORIZED}}\n\n",
"template": "\n${{CHANGELOG}}</details>\n\n## Uncategorized:\n\n<details><summary>Changelog</summary>\n\n${{UNCATEGORIZED}}\n</details>\n",
"pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}",
"empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}",
"label_extractor": [
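
Rendered on a release page, the retitled categories chain the collapsible sections: each heading now closes the previous <details> block before opening its own, and the template's trailing </details> closes the last section. A sketch of the generated changelog (entries hypothetical):

## ASR

<details><summary>Changelog</summary>

- Fix RNNT alignments test by @someuser :: PR: #0000
</details>

## TTS

<details><summary>Changelog</summary>
...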
59 changes: 59 additions & 0 deletions .github/workflows/mcore-tag-bump-bot.yml
@@ -0,0 +1,59 @@
# Regularly updates the CI container
name: MCore Tag Bump Bot
on:
workflow_dispatch:
schedule:
- cron: 0 0 * * *
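      # i.e., daily at 00:00 UTC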

jobs:
main:
runs-on: ubuntu-latest
environment: main
steps:
- name: Checkout NVIDIA/Megatron-LM
uses: actions/checkout@v4
with:
repository: NVIDIA/Megatron-LM
ref: main
path: ${{ github.run_id }}

- name: Get latest mcore commit
id: ref
run: |
cd ${{ github.run_id }}
sha=$(git rev-parse origin/main)
echo "sha=${sha}" >> "$GITHUB_OUTPUT"
echo "short_sha=${sha:0:7}" >> "$GITHUB_OUTPUT"
echo "date=$(date +%F)" >> "$GITHUB_OUTPUT"
- name: Checkout ${{ github.repository }}
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}
token: ${{ secrets.PAT }}

- name: Bump MCORE_TAG
run: |
cd ${{ github.run_id }}
sed -i 's/^ARG MCORE_TAG=.*$/ARG MCORE_TAG=${{ steps.ref.outputs.sha }}/' Dockerfile.ci
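        # Dockerfile.ci pins Megatron-LM via ARG MCORE_TAG; the sed swaps in the
        # full commit SHA resolved in the step above.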
- name: Create Bump PR
uses: peter-evans/create-pull-request@v6
id: create-pull-request
with:
path: ${{ github.run_id }}
branch: bump-ci-container-${{ steps.ref.outputs.date }}
base: main
title: 'Bump `Dockerfile.ci` (${{ steps.ref.outputs.date }})'
token: ${{ secrets.PAT }}
body: |
🚀 PR to Bump `Dockerfile.ci`.
📝 Please remember the following to-do's before merge:
- [ ] Verify the presubmit CI
🙏 Please merge this PR only if the CI workflow completed successfully.
commit-message: "[🤠]: Howdy folks, let's bump `Dockerfile.ci` to ${{ steps.ref.outputs.short_sha }} !"
signoff: true
reviewers: 'pablo-garay'
labels: 'Run CICD'
