Merge branch 'main' into rohitrango/controlnet_cibug
Victor49152 authored Jul 30, 2024
2 parents 1bd4a73 + c29d91a commit e145e01
Showing 177 changed files with 26,302 additions and 1,454 deletions.
2 changes: 2 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,2 @@
.github/ @pablo-garay @ko3n1g
Dockerfile.ci @pablo-garay @ko3n1g
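# (GitHub requests review from the listed owners for any change matching these paths)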
154 changes: 107 additions & 47 deletions .github/workflows/cicd-main.yml
@@ -256,26 +256,29 @@ jobs:
quantization.num_calib_size=8 \
inference.batch_size=2 \
export.inference_tensor_parallel=2 \
export.sample_output=False \
export.save_path=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
L2_PTQ_Llama2_INT8_SQ:
OPTIONAL_L2_PTQ_Llama2_INT8_SQ:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
TIMEOUT: 15
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
quantization.num_calib_size=8 \
inference.batch_size=2 \
export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
quantization.num_calib_size=8 \
inference.batch_size=2 \
export.sample_output=False \
export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
IS_OPTIONAL: true
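
  # Both PTQ jobs above delegate to the shared template workflow. For orientation,
  # a minimal sketch of how .github/workflows/_test_template.yml could declare the
  # inputs used at these call sites (the template itself is not part of this diff,
  # so the exact shape and defaults below are assumptions):
  #
  #   on:
  #     workflow_call:
  #       inputs:
  #         RUNNER:
  #           type: string
  #           required: true
  #         TIMEOUT:
  #           type: number
  #           default: 10
  #         SCRIPT:
  #           type: string
  #           required: true
  #         AFTER_SCRIPT:
  #           type: string
  #           default: ''
  #         IS_OPTIONAL:
  #           type: boolean
  #           default: false
  #
  # Presumably, IS_OPTIONAL lets the template treat a failure as non-blocking
  # for the calling pipeline.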

# TODO: investigate int4_awq stuck issues and restore the test
#L2_PTQ_Llama2_INT4_AWQ:
@@ -310,44 +313,42 @@ jobs:
#- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

L2_QAT_Llama2_INT4:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \
quantization.algorithm=int4 \
quantization.num_calib_size=8 \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_steps=4 \
trainer.val_check_interval=4 \
+trainer.limit_val_batches=2 \
exp_manager.explicit_log_dir=llama2_qat_results \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.global_batch_size=2 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl]
rm -rf llama2_qat_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
# OPTIONAL_L2_QAT_Llama2_INT4:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \
# quantization.algorithm=int4 \
# quantization.num_calib_size=8 \
# trainer.devices=1 \
# trainer.num_nodes=1 \
# trainer.max_steps=4 \
# trainer.val_check_interval=4 \
# +trainer.limit_val_batches=2 \
# exp_manager.explicit_log_dir=llama2_qat_results \
# model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# model.tensor_model_parallel_size=1 \
# model.pipeline_model_parallel_size=1 \
# model.global_batch_size=2 \
# model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
# model.data.train_ds.concat_sampling_probabilities=[1.0] \
# model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl]

# rm -rf llama2_qat_results

# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
@@ -810,7 +811,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1
pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads
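      # --with_downloads is a project-defined pytest option (its registration in the
      # repo's conftest is assumed here); it opts this test in to fetching remote assets.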
# L2: Segmentation Tool
L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav:
@@ -3305,6 +3306,62 @@ jobs:
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/lora_tuning_tp2
L2_Megatron_GPT_PEFT_Lora_TP2SP1:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-gpus-2-h100
SCRIPT: |
rm -rf /home/TestData/nlp/lora_tuning_tp2_sp1
CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
trainer.max_steps=3 \
trainer.val_check_interval=3 \
++trainer.limit_val_batches=2 \
trainer.precision=bf16 \
exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2_sp1 \
+model.mcore_gpt=True \
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.sequence_parallel=True \
model.megatron_amp_O2=True \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
+model.fp8=True \
+model.fp8_params=True \
+model.fp8_hybrid=True \
+model.fp8_e4m3=False \
+model.fp8_interval=1 \
+model.fp8_margin=0 \
+model.fp8_amax_history_len=32 \
+model.fp8_amax_compute_algo=max \
+model.reduce_amax=False \
+model.ub_tp_comm_overlap=False \
+model.tp_comm_overlap_ag=False \
+model.tp_comm_overlap_rs=False \
+model.tp_comm_overlap_disable_qkv=True \
model.peft.peft_scheme='lora' \
model.peft.lora_tuning.adapter_dim=16 \
model.peft.lora_tuning.alpha=32 \
model.peft.lora_tuning.column_init_method="kaiming" \
+model.peft.lora_tuning.dropout_position='pre' \
model.peft.lora_tuning.target_modules=['attention'] \
model.peft.lora_tuning.adapter_dropout=0.1 \
+model.peft.lora_tuning.a2a_experimental=1 \
model.answer_only_loss=True \
model.micro_batch_size=1 \
model.global_batch_size=1 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.train_ds.num_workers=0 \
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/lora_tuning_tp2_sp1
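  # For reference, the "+model.fp8*" overrides above append new keys to the model
  # config; under Hydra's standard override-to-YAML mapping they correspond to:
  #   model:
  #     fp8: true                  # enable FP8 via Transformer Engine
  #     fp8_params: true
  #     fp8_hybrid: true           # E4M3 forward, E5M2 for gradients
  #     fp8_e4m3: false
  #     fp8_interval: 1
  #     fp8_margin: 0
  #     fp8_amax_history_len: 32   # amax window for computing scaling factors
  #     fp8_amax_compute_algo: max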
L2_Megatron_GPT_Eval:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -3859,7 +3916,7 @@ jobs:
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.pipeline_model_parallel_split_rank=1 \
model.pipeline_model_parallel_split_rank=0 \
model.seq_length=256 \
model.encoder.num_layers=4 \
model.decoder.num_layers=1 \
@@ -4560,6 +4617,8 @@ jobs:

Nemo_CICD_Test:
needs:
- gpu-test
- cicd-test-container-setup
- L0_Unit_Tests_GPU
- L0_Unit_Tests_CPU
- L2_Community_LLM_Checkpoints_tests_Llama
@@ -4630,6 +4689,7 @@ jobs:
- L2_Megatron_GPT_Embedding
- L2_Megatron_GPT_PEFT_Lora_PP2_O2
- L2_Megatron_GPT_PEFT_Lora_TP2_O1
- L2_Megatron_GPT_PEFT_Lora_TP2SP1
- L2_Megatron_GPT_Eval
- L2_Megatron_GPT_Eval_PP2
- L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len
20 changes: 10 additions & 10 deletions .github/workflows/config/changelog-config.json
@@ -1,47 +1,47 @@
{
"categories": [
{
"title": "## ASR \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "## ASR\n\n<details><summary>Changelog</summary>",
"labels": ["asr"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## TTS \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## TTS\n\n<details><summary>Changelog</summary>",
"labels": ["tts"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## NLP / NMT \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## NLP / NMT\n\n<details><summary>Changelog</summary>",
"labels": ["nlp", "nmt", "megatron"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## Text Normalization / Inverse Text Normalization \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## Text Normalization / Inverse Text Normalization\n\n<details><summary>Changelog</summary>",
"labels": ["tn", "itn"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## NeMo Tools \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## NeMo Tools\n\n<details><summary>Changelog</summary>",
"labels": ["tools"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## Export \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## Export\n\n<details><summary>Changelog</summary>",
"labels": ["export"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## Documentation \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## Documentation\n\n<details><summary>Changelog</summary>",
"labels": ["docs"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## Bugfixes \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## Bugfixes\n\n<details><summary>Changelog</summary>",
"labels": ["bug"],
"exclude_labels": ["cherry-pick"]
},
{
"title": "## Cherrypick \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
"title": "</details>\n\n## Cherrypick\n\n<details><summary>Changelog</summary>",
"labels": ["cherry-pick"],
"exclude_labels": ["cherry-pick"]
}
@@ -50,7 +50,7 @@
"ignore"
],
"sort": "ASC",
"template": "\n${{CHANGELOG}}\nUncategorized:\n${{UNCATEGORIZED}}\n\n",
"template": "\n${{CHANGELOG}}</details>\n\n## Uncategorized:\n\n<details><summary>Changelog</summary>\n\n${{UNCATEGORIZED}}\n</details>\n",
"pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}",
"empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}",
"label_extractor": [
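
Rendered on a release page, the retitled categories chain the collapsible sections: each heading now closes the previous <details> block before opening its own, and the template's trailing </details> closes the last section. A sketch of the generated changelog (entries hypothetical):

## ASR

<details><summary>Changelog</summary>

- Fix RNNT alignments test by @someuser :: PR: #0000
</details>

## TTS

<details><summary>Changelog</summary>
...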
59 changes: 59 additions & 0 deletions .github/workflows/mcore-tag-bump-bot.yml
@@ -0,0 +1,59 @@
# Regularly updates the CI container
name: MCore Tag Bump Bot
on:
workflow_dispatch:
schedule:
- cron: 0 0 * * *
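      # i.e., daily at 00:00 UTC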

jobs:
main:
runs-on: ubuntu-latest
environment: main
steps:
- name: Checkout NVIDIA/Megatron-LM
uses: actions/checkout@v4
with:
repository: NVIDIA/Megatron-LM
ref: main
path: ${{ github.run_id }}

- name: Get latest mcore commit
id: ref
run: |
cd ${{ github.run_id }}
sha=$(git rev-parse origin/main)
echo "sha=${sha}" >> "$GITHUB_OUTPUT"
echo "short_sha=${sha:0:7}" >> "$GITHUB_OUTPUT"
echo "date=$(date +%F)" >> "$GITHUB_OUTPUT"
- name: Checkout ${{ github.repository }}
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}
token: ${{ secrets.PAT }}

- name: Bump MCORE_TAG
run: |
cd ${{ github.run_id }}
sed -i 's/^ARG MCORE_TAG=.*$/ARG MCORE_TAG=${{ steps.ref.outputs.sha }}/' Dockerfile.ci
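        # Dockerfile.ci pins Megatron-LM via ARG MCORE_TAG; the sed swaps in the
        # full commit SHA resolved in the step above.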
- name: Create Bump PR
uses: peter-evans/create-pull-request@v6
id: create-pull-request
with:
path: ${{ github.run_id }}
branch: bump-ci-container-${{ steps.ref.outputs.date }}
base: main
title: 'Bump `Dockerfile.ci` (${{ steps.ref.outputs.date }})'
token: ${{ secrets.PAT }}
body: |
🚀 PR to Bump `Dockerfile.ci`.
📝 Please remember the following to-do's before merge:
- [ ] Verify the presubmit CI
🙏 Please merge this PR only if the CI workflow completed successfully.
commit-message: "[🤠]: Howdy folks, let's bump `Dockerfile.ci` to ${{ steps.ref.outputs.short_sha }} !"
signoff: true
reviewers: 'pablo-garay'
labels: 'Run CICD'
