diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index 9bd50e14806d..63d2297838c3 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -202,6 +202,12 @@ model:
   ## Flash Attention
   use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True
 
+  ## Offloading Activations/Weights to CPU
+  cpu_offloading: False
+  cpu_offloading_num_layers: 11 # This value should be in [1, num_layers-1]; we don't offload the final layer's activations, so no offloading latency is exposed for the final layer
+  cpu_offloading_activations: True
+  cpu_offloading_weights: True
+
   ## Network
   sharp: False # Enable the use of SHARP for NCCL data-parallel communications. This is going to be ignored if the network doesn't support SHARP.
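
The sketch below shows one way the new keys might be toggled programmatically with OmegaConf (the loader NeMo's Hydra configs use). It is a minimal illustration, not part of this patch: the config path assumes you run from the NeMo repo root, and the chosen override values (activations-only offloading for all but the final layer) are assumptions for demonstration, not defaults from the diff.

# Minimal sketch, assuming the config file above is available locally.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_gpt_config.yaml")

# Enable CPU offloading of activations only (illustrative values, not defaults).
cfg.model.cpu_offloading = True
# Keep the value within [1, num_layers-1] so the final layer is never offloaded.
cfg.model.cpu_offloading_num_layers = cfg.model.num_layers - 1
cfg.model.cpu_offloading_activations = True
cfg.model.cpu_offloading_weights = False

# Print the resulting model section without resolving interpolations.
print(OmegaConf.to_yaml(cfg.model))

The same overrides can also be passed on the command line of a Hydra-launched training script (e.g. model.cpu_offloading=True model.cpu_offloading_num_layers=11), which is the more common workflow for NeMo examples.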