Skip to content

Commit

Permalink
Fix mcore conversion bug (NVIDIA#7846)
Browse files Browse the repository at this point in the history
* fix bug for gpt

* add more user friendly messages for OOM

* rename in-file to in-folder and add overwrite option

* change print to logging

* fix issue where BF16+O1 nemo ckpt converted to FP32 mcore ckpt

Signed-off-by: Chen Cui <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add warning for MegatronGPTSFTModel

Signed-off-by: Chen Cui <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Chen Cui <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
cuichenx and pre-commit-ci[bot] authored Nov 7, 2023
1 parent 5766c7b commit 51182a4
Showing 1 changed file with 51 additions and 17 deletions.
68 changes: 51 additions & 17 deletions scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@
Then, run this conversion script:
python convert_nemo_gpt_to_mcore.py \
--in-file <path to extracted, TP1 PP1 legacy checkpoint folder> \
--out-file <path to output nemo ile>
--in-folder <path to extracted, TP1 PP1 legacy checkpoint folder> \
--out-file <path to output nemo file>
"""


def get_args():
parser = ArgumentParser()
parser.add_argument(
"--in-file", type=str, default=None, required=True, help="Path to extracted, TP1 PP1 NeMo GPT checkpoint.",
"--in-folder", type=str, default=None, required=True, help="Path to extracted, TP1 PP1 NeMo GPT checkpoint.",
)
parser.add_argument(
"--out-file", type=str, default=None, required=True, help="Path to output mcore weights file (ends in .nemo)."
Expand All @@ -57,6 +57,11 @@ def get_args():
help="Load model in cpu only. Useful if the model cannot fit in GPU memory, "
"but this option makes the conversion script significantly slower.",
)
parser.add_argument(
"--overwrite",
action="store_true",
help="Run conversion again and overwrite output file when the output file already exists",
)
args = parser.parse_args()
return args

Expand All @@ -77,7 +82,7 @@ def get_mcore_model_from_nemo_file(nemo_restore_from_path, cpu_only=False):
app_state.nemo_file_folder = nemo_restore_from_path
else:
logging.warning(
"`nemo_file_folder` is NOT set because checkpoint is not pre-extracted. Subsequent operations may fail."
"⚠️ `nemo_file_folder` is NOT set because checkpoint is not pre-extracted. Subsequent operations may fail."
)
mcore_model = MegatronGPTModel(model_cfg, trainer=trainer)
return mcore_model
Expand Down Expand Up @@ -163,22 +168,36 @@ def load_model(model, state_dict):

def restore_model(nemo_file, cpu_only=False):
dummy_trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy())
if cpu_only:
map_location = torch.device('cpu')
model_config = MegatronGPTModel.restore_from(
nemo_file, trainer=dummy_trainer, return_config=True, map_location=map_location
map_location = torch.device('cpu') if cpu_only else None
model_config = MegatronGPTModel.restore_from(
nemo_file, trainer=dummy_trainer, return_config=True, map_location=map_location
)
model_config.use_cpu_initialization = cpu_only

# To copy weights in the original precision, we have to turn on O2.
orig_megatron_amp_O2_value = model_config.megatron_amp_O2
if model_config.target.endswith("MegatronGPTSFTModel"):
logging.warning(
"⚠️ Model target is `MegatronGPTSFTModel` which may not work with this conversion script. "
"This is a known issue. For now, please modify the config yaml file to use `MegatronGPTModel`."
)
model_config.use_cpu_initialization = True
else:
model_config, map_location = None, None
return MegatronGPTModel.restore_from(

if model_config.precision in ['bf16', 'bf16-mixed']:
model_config.megatron_amp_O2 = True

model = MegatronGPTModel.restore_from(
nemo_file, trainer=dummy_trainer, override_config_path=model_config, map_location=map_location
)

# restore O2 to the original value so mcore model has the same config
model.cfg.megatron_amp_O2 = orig_megatron_amp_O2_value
return model


def convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_only=False):
if skip_if_output_exists and os.path.exists(output_nemo_file):
logging.info(f"Output file already exists ({output_nemo_file}), skipping conversion...")
logging.info("If you want to overwrite the output file, please run with --overwrite flag")
return
nemo_model = restore_model(input_nemo_file, cpu_only=cpu_only)

Expand All @@ -193,6 +212,8 @@ def convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_o
mcore_state_dict[mcore_param] = torch.cat(
[nemo_state_dict[nemo_param], nemo_state_dict[second_param]], dim=0
)
else:
mcore_state_dict[mcore_param] = nemo_state_dict[nemo_param]
else:
mcore_state_dict[mcore_param] = nemo_state_dict[nemo_param]

Expand All @@ -205,7 +226,9 @@ def convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_o

mcore_model.cfg.use_cpu_initialization = False
mcore_model.save_to(output_nemo_file)
logging.info(f"Done. Model saved to {output_nemo_file}")
logging.info(f"✅ Done. Model saved to {output_nemo_file}")
del mcore_model
del nemo_model


def run_sanity_checks(nemo_file, mcore_file, cpu_only=False):
Expand Down Expand Up @@ -239,7 +262,8 @@ def run_sanity_checks(nemo_file, mcore_file, cpu_only=False):
# linear_fc1.weight should map to concat(dense_h_to_4h.weight, dense_h_to_4h_2.weight)
# but build_key_mapping only maps it to dense_h_to_4h.weight, so we handle the concat here.
second_param = nemo_param.replace("dense_h_to_4h.weight", "dense_h_to_4h_2.weight")
nemo_weight = torch.cat([nemo_weight, nemo_state_dict.pop(second_param)])
if second_param in nemo_state_dict:
nemo_weight = torch.cat([nemo_weight, nemo_state_dict.pop(second_param)])
assert torch.allclose(mcore_weight, nemo_weight), f"❌ parameter {mcore_param} does not match"
except KeyError:
buffers = [k for k, v in mcore_model.named_buffers()]
Expand All @@ -261,11 +285,21 @@ def run_sanity_checks(nemo_file, mcore_file, cpu_only=False):
if __name__ == '__main__':
args = get_args()

input_nemo_file = args.in_file
input_nemo_file = args.in_folder
output_nemo_file = args.out_file
cpu_only = args.cpu_only
overwrite = args.overwrite

os.makedirs(os.path.dirname(output_nemo_file), exist_ok=True)
convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_only=cpu_only)
try:
convert(input_nemo_file, output_nemo_file, skip_if_output_exists=not overwrite, cpu_only=cpu_only)
except torch.cuda.OutOfMemoryError:
logging.error("Could not convert due to torch.cuda.OutOfMemoryError.")
logging.error("Please run the script with --cpu-only flag")
exit(1)
torch.cuda.empty_cache()
run_sanity_checks(input_nemo_file, output_nemo_file, cpu_only=cpu_only)
try:
run_sanity_checks(input_nemo_file, output_nemo_file, cpu_only=cpu_only)
except torch.cuda.OutOfMemoryError:
logging.info("✅ Conversion was successful, but could not run sanity check due to torch.cuda.OutOfMemoryError.")
logging.info("Please run the script with the same command again to run sanity check.")

0 comments on commit 51182a4

Please sign in to comment.