From c37d2a7f4b0138a1c38d7ab33488e40e8170c7ab Mon Sep 17 00:00:00 2001
From: Selvaraj Anandaraj
Date: Tue, 9 Jan 2024 23:19:18 -0800
Subject: [PATCH 1/3] Added sample cpu_offloading switch to YAML

Signed-off-by: Selvaraj Anandaraj
---
 .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index 32cab48a68c8..0253bfc52f5f 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -194,6 +194,12 @@ model:
   ## Flash Attention
   use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True
 
+  ## Offloading Activations/Weights to CPU
+  cpu_offloading: False
+  cpu_offloading_num_layers: 1
+  cpu_offloading_activations: True
+  cpu_offloading_weights: True
+
   ## Network
   sharp: False # Enable the use of SHARP for NCCL data-parallel communications. This is going to be ignored if the network doesn't support SHARP.
 

From 8ddfeeb349a7c8eb2912da67062f33a28b5b428d Mon Sep 17 00:00:00 2001
From: Selvaraj Anandaraj
Date: Tue, 9 Jan 2024 23:39:56 -0800
Subject: [PATCH 2/3] Added comments

Signed-off-by: Selvaraj Anandaraj
---
 examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index 0253bfc52f5f..bbfe07539b83 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -196,7 +196,7 @@ model:
 
   ## Offloading Activations/Weights to CPU
   cpu_offloading: False
-  cpu_offloading_num_layers: 1
+  cpu_offloading_num_layers: ${subtract:${num_layers},1} # This value should be in [1, num_layers-1]; the final layer's activations are not offloaded, since that would expose the offloading latency
   cpu_offloading_activations: True
   cpu_offloading_weights: True
 

From 3065ddfe4e727e27f2c6d24b9f7a18d8b313f62c Mon Sep 17 00:00:00 2001
From: Selvaraj Anandaraj
Date: Tue, 23 Jan 2024 13:26:04 -0800
Subject: [PATCH 3/3] Removed arithmetic op

Signed-off-by: Selvaraj Anandaraj
---
 examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index bbfe07539b83..f5ef0eaf27ff 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -196,7 +196,7 @@ model:
 
   ## Offloading Activations/Weights to CPU
   cpu_offloading: False
-  cpu_offloading_num_layers: ${subtract:${num_layers},1} # This value should be in [1, num_layers-1]; the final layer's activations are not offloaded, since that would expose the offloading latency
+  cpu_offloading_num_layers: 11 # This value should be in [1, num_layers-1]; the final layer's activations are not offloaded, since that would expose the offloading latency
   cpu_offloading_activations: True
   cpu_offloading_weights: True
 
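
Note on the ${subtract:...} interpolation: "subtract" is not a built-in
OmegaConf resolver, so the value introduced in PATCH 2/3 only resolves if the
application registers a custom resolver first; PATCH 3/3 sidesteps that by
hard-coding 11 (presumably num_layers - 1 for the config's default of 12
layers). Below is a minimal sketch of such a registration using OmegaConf's
public resolver API; it is illustrative only, not NeMo's actual setup.

    from omegaconf import OmegaConf

    # Register a custom "subtract" resolver so that
    # ${subtract:${num_layers},1} evaluates to num_layers - 1 at
    # config-resolution time. Without a registered resolver, OmegaConf
    # raises an interpolation-resolution error when the value is accessed.
    OmegaConf.register_new_resolver("subtract", lambda a, b: int(a) - int(b))

    cfg = OmegaConf.create(
        "num_layers: 12\n"
        "cpu_offloading_num_layers: ${subtract:${num_layers},1}\n"
    )
    print(cfg.cpu_offloading_num_layers)  # -> 11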