diff --git a/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py b/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py
index 23111f441101..f504cf93b087 100644
--- a/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py
+++ b/scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py
@@ -38,15 +38,15 @@
 Then, run this conversion script:
 python convert_nemo_gpt_to_mcore.py \
- --in-file <path to extracted, TP1 PP1 NeMo GPT checkpoint folder> \
- --out-file <path to output mcore .nemo file>
+ --in-folder <path to extracted, TP1 PP1 NeMo GPT checkpoint folder> \
+ --out-file <path to output mcore .nemo file>
 """
 
 
 def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--in-file", type=str, default=None, required=True, help="Path to extracted, TP1 PP1 NeMo GPT checkpoint.",
+        "--in-folder", type=str, default=None, required=True, help="Path to extracted, TP1 PP1 NeMo GPT checkpoint.",
     )
     parser.add_argument(
         "--out-file", type=str, default=None, required=True, help="Path to output mcore weights file (ends in .nemo)."
     )
@@ -57,6 +57,11 @@ def get_args():
         help="Load model in cpu only. Useful if the model cannot fit in GPU memory, "
         "but this option makes the conversion script significantly slower.",
     )
+    parser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Run conversion again and overwrite output file when the output file already exists",
+    )
     args = parser.parse_args()
     return args
 
@@ -77,7 +82,7 @@ def get_mcore_model_from_nemo_file(nemo_restore_from_path, cpu_only=False):
         app_state.nemo_file_folder = nemo_restore_from_path
     else:
         logging.warning(
-            "`nemo_file_folder` is NOT set because checkpoint is not pre-extracted. Subsequent operations may fail."
+            "⚠️ `nemo_file_folder` is NOT set because checkpoint is not pre-extracted. Subsequent operations may fail."
         )
     mcore_model = MegatronGPTModel(model_cfg, trainer=trainer)
     return mcore_model
@@ -163,22 +168,36 @@ def load_model(model, state_dict):
 
 def restore_model(nemo_file, cpu_only=False):
     dummy_trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy())
 
-    if cpu_only:
-        map_location = torch.device('cpu')
-        model_config = MegatronGPTModel.restore_from(
-            nemo_file, trainer=dummy_trainer, return_config=True, map_location=map_location
-        )
-        model_config.use_cpu_initialization = True
-    else:
-        model_config, map_location = None, None
-    return MegatronGPTModel.restore_from(
+    map_location = torch.device('cpu') if cpu_only else None
+    model_config = MegatronGPTModel.restore_from(
+        nemo_file, trainer=dummy_trainer, return_config=True, map_location=map_location
+    )
+    model_config.use_cpu_initialization = cpu_only
+
+    # To copy weights in the original precision, we have to turn on O2.
+    orig_megatron_amp_O2_value = model_config.megatron_amp_O2
+    if model_config.target.endswith("MegatronGPTSFTModel"):
+        logging.warning(
+            "⚠️ Model target is `MegatronGPTSFTModel` which may not work with this conversion script. "
+            "This is a known issue. For now, please modify the config yaml file to use `MegatronGPTModel`."
+        )
+
+    if model_config.precision in ['bf16', 'bf16-mixed']:
+        model_config.megatron_amp_O2 = True
+
+    model = MegatronGPTModel.restore_from(
         nemo_file, trainer=dummy_trainer, override_config_path=model_config, map_location=map_location
     )
+    # restore O2 to the original value so mcore model has the same config
+    model.cfg.megatron_amp_O2 = orig_megatron_amp_O2_value
+    return model
+
 
 def convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_only=False):
     if skip_if_output_exists and os.path.exists(output_nemo_file):
         logging.info(f"Output file already exists ({output_nemo_file}), skipping conversion...")
+        logging.info("If you want to overwrite the output file, please run with --overwrite flag")
         return
 
     nemo_model = restore_model(input_nemo_file, cpu_only=cpu_only)
@@ -193,6 +212,8 @@ def convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_o
                 mcore_state_dict[mcore_param] = torch.cat(
                     [nemo_state_dict[nemo_param], nemo_state_dict[second_param]], dim=0
                 )
+            else:
+                mcore_state_dict[mcore_param] = nemo_state_dict[nemo_param]
         else:
             mcore_state_dict[mcore_param] = nemo_state_dict[nemo_param]
 
@@ -205,7 +226,9 @@ def convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_o
 
     mcore_model.cfg.use_cpu_initialization = False
     mcore_model.save_to(output_nemo_file)
-    logging.info(f"Done. Model saved to {output_nemo_file}")
+    logging.info(f"✅ Done. Model saved to {output_nemo_file}")
+    del mcore_model
+    del nemo_model
 
 
 def run_sanity_checks(nemo_file, mcore_file, cpu_only=False):
@@ -239,7 +262,8 @@ def run_sanity_checks(nemo_file, mcore_file, cpu_only=False):
                 # linear_fc1.weight should map to concat(dense_h_to_4h.weight, dense_h_to_4h_2.weight)
                 # but build_key_mapping only maps it to dense_h_to_4h.weight, so we handle the concat here.
                 second_param = nemo_param.replace("dense_h_to_4h.weight", "dense_h_to_4h_2.weight")
-                nemo_weight = torch.cat([nemo_weight, nemo_state_dict.pop(second_param)])
+                if second_param in nemo_state_dict:
+                    nemo_weight = torch.cat([nemo_weight, nemo_state_dict.pop(second_param)])
             assert torch.allclose(mcore_weight, nemo_weight), f"❌ parameter {mcore_param} does not match"
         except KeyError:
             buffers = [k for k, v in mcore_model.named_buffers()]
@@ -261,11 +285,21 @@ def run_sanity_checks(nemo_file, mcore_file, cpu_only=False):
 
 if __name__ == '__main__':
     args = get_args()
-    input_nemo_file = args.in_file
+    input_nemo_file = args.in_folder
     output_nemo_file = args.out_file
     cpu_only = args.cpu_only
+    overwrite = args.overwrite
 
     os.makedirs(os.path.dirname(output_nemo_file), exist_ok=True)
-    convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_only=cpu_only)
+    try:
+        convert(input_nemo_file, output_nemo_file, skip_if_output_exists=not overwrite, cpu_only=cpu_only)
+    except torch.cuda.OutOfMemoryError:
+        logging.error("Could not convert due to torch.cuda.OutOfMemoryError.")
+        logging.error("Please run the script with --cpu-only flag")
+        exit(1)
     torch.cuda.empty_cache()
-    run_sanity_checks(input_nemo_file, output_nemo_file, cpu_only=cpu_only)
+    try:
+        run_sanity_checks(input_nemo_file, output_nemo_file, cpu_only=cpu_only)
+    except torch.cuda.OutOfMemoryError:
+        logging.info("✅ Conversion was successful, but could not run sanity check due to torch.cuda.OutOfMemoryError.")
+        logging.info("Please run the script with the same command again to run sanity check.")
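
For reference, a typical invocation after this change might look like the following, in the style of the script's docstring; the paths here are illustrative, not taken from the PR:

python convert_nemo_gpt_to_mcore.py \
 --in-folder /path/to/extracted_nemo_checkpoint \
 --out-file /path/to/output/megatron_gpt_mcore.nemo \
 --cpu-only \
 --overwrite

Passing --cpu-only avoids the torch.cuda.OutOfMemoryError path handled above at the cost of a slower conversion, and --overwrite disables the skip_if_output_exists early return so an existing output file is regenerated.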