Added sample cpu_offloading switch to YAML #8148

Merged
merged 5 commits on Jan 24, 2024
6 changes: 6 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -202,6 +202,12 @@ model:
## Flash Attention
use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True

## Offloading Activations/Weights to CPU
cpu_offloading: False # Enable offloading of activations/weights to CPU
cpu_offloading_num_layers: 11 # Should be in [1, num_layers-1]; the final layer is never offloaded, so no offloading latency is exposed for it
cpu_offloading_activations: True # Offload activations to CPU
cpu_offloading_weights: True # Offload weights to CPU

## Network
sharp: False # Enable the use of SHARP for NCCL data-parallel communications. This is going to be ignored if the network doesn't support SHARP.
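
Since NeMo loads this YAML through Hydra/OmegaConf, the new switches can also be toggled without editing the file. A minimal sketch, assuming the config path above and OmegaConf's standard API (the override values here are illustrative, not taken from the PR):

from omegaconf import OmegaConf

# Load the stock config and enable activation offloading only.
cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_gpt_config.yaml")
cfg.model.cpu_offloading = True
cfg.model.cpu_offloading_num_layers = 11   # must stay within [1, num_layers-1]
cfg.model.cpu_offloading_activations = True
cfg.model.cpu_offloading_weights = False   # keep weights resident on GPU
print(OmegaConf.to_yaml(cfg.model))

The same overrides can be passed on the command line of a Hydra-driven entry point, e.g. model.cpu_offloading=True model.cpu_offloading_num_layers=11.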
