
Commit: more test updates
Signed-off-by: Ananth Subramaniam <[email protected]>
ananthsub committed Dec 16, 2024
1 parent aae096b commit d4ee8de
Showing 4 changed files with 5 additions and 4 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/cicd-main.yml
@@ -2895,7 +2895,7 @@ jobs:
 with:
   RUNNER: self-hosted-azure-gpus-2-h100
   SCRIPT: |
-    CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
+    CUDA_DEVICE_MAX_CONNECTIONS=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
     trainer.devices=2 \
     trainer.log_every_n_steps=1 \
     trainer.max_epochs=9999 \
@@ -2923,6 +2923,7 @@ jobs:
     +model.tp_comm_overlap_ag=False \
     +model.tp_comm_overlap_rs=False \
     +model.tp_comm_overlap_disable_qkv=True \
+    +model.attention_backend="fused" \
     model.peft.peft_scheme="lora" \
     model.peft.lora_tuning.adapter_dim=16 \
     model.peft.lora_tuning.alpha=32 \
@@ -4204,7 +4205,7 @@ jobs:
 with:
   RUNNER: self-hosted-azure
   SCRIPT: |
-    NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \
+    python3 tests/collections/llm/megatron_mixtral_pretraining.py \
     --experiment-dir=/tmp/mixtral_pretrain_results \
     --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document
1 change: 0 additions & 1 deletion tests/collections/llm/hf/peft_nemorun.py
@@ -28,7 +28,6 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecut
"NCCL_NVLS_ENABLE": "0",
"NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
"NVTE_ASYNC_AMAX_REDUCTION": "1",
"NVTE_FUSED_ATTN": "0",
}

executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
1 change: 0 additions & 1 deletion tests/collections/llm/hf/sft_nemorun.py
@@ -29,7 +29,6 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecut
"NCCL_NVLS_ENABLE": "0",
"NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
"NVTE_ASYNC_AMAX_REDUCTION": "1",
"NVTE_FUSED_ATTN": "0",
}

executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
2 changes: 2 additions & 0 deletions tests/collections/llm/megatron_mixtral_pretraining.py
@@ -18,6 +18,7 @@

 import torch
 from megatron.core.distributed import DistributedDataParallelConfig as McoreDDPConfig
+from megatron.core.transformer.enums import AttnBackend

 from nemo.collections.llm import MixtralConfig8x3B, MixtralModel, PreTrainingDataModule
 from nemo.collections.llm.api import train
@@ -117,6 +118,7 @@ def main(args):
         bf16=True,
         params_dtype=torch.bfloat16,
         pipeline_dtype=torch.bfloat16,
+        attention_backend=AttnBackend.unfused,
     )
     mixtral_config.overlap_param_gather_with_optimizer_step = True

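For reference, a minimal sketch of the config-based backend selection this commit switches to, in place of the NVTE_FLASH_ATTN / NVTE_FUSED_ATTN environment variables. The AttnBackend import and the attention_backend keyword are taken from the diff above; the remaining kwargs simply mirror the test, and everything not shown in the diff is omitted here.

import torch
from megatron.core.transformer.enums import AttnBackend

from nemo.collections.llm import MixtralConfig8x3B

# Pick the attention implementation explicitly on the model config instead of
# via NVTE_* environment variables (unfused here, matching the test above).
mixtral_config = MixtralConfig8x3B(
    bf16=True,
    params_dtype=torch.bfloat16,
    pipeline_dtype=torch.bfloat16,
    attention_backend=AttnBackend.unfused,
)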
