Fix mcore conversion bug #7846

Merged
merged 10 commits on Nov 7, 2023
scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py (68 changes: 51 additions & 17 deletions)
@@ -38,15 +38,15 @@

Then, run this conversion script:
python convert_nemo_gpt_to_mcore.py \
- --in-file <path to extracted, TP1 PP1 legacy checkpoint folder> \
- --out-file <path to output nemo ile>
+ --in-folder <path to extracted, TP1 PP1 legacy checkpoint folder> \
+ --out-file <path to output nemo file>
"""


def get_args():
parser = ArgumentParser()
parser.add_argument(
"--in-file", type=str, default=None, required=True, help="Path to extracted, TP1 PP1 NeMo GPT checkpoint.",
"--in-folder", type=str, default=None, required=True, help="Path to extracted, TP1 PP1 NeMo GPT checkpoint.",
)
parser.add_argument(
"--out-file", type=str, default=None, required=True, help="Path to output mcore weights file (ends in .nemo)."
@@ -57,6 +57,11 @@
help="Load model in cpu only. Useful if the model cannot fit in GPU memory, "
"but this option makes the conversion script significantly slower.",
)
+ parser.add_argument(
+ "--overwrite",
+ action="store_true",
+ help="Run conversion again and overwrite output file when the output file already exists",
+ )
args = parser.parse_args()
return args
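For reference, a full invocation that uses the renamed --in-folder flag together with the new --overwrite and the existing --cpu-only options could look like the following (the paths are placeholders, and both optional flags can be dropped):

    python convert_nemo_gpt_to_mcore.py \
        --in-folder /path/to/extracted_tp1_pp1_checkpoint \
        --out-file /path/to/output_mcore.nemo \
        --cpu-only \
        --overwrite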

@@ -77,7 +82,7 @@
app_state.nemo_file_folder = nemo_restore_from_path
else:
logging.warning(
"`nemo_file_folder` is NOT set because checkpoint is not pre-extracted. Subsequent operations may fail."
"⚠️ `nemo_file_folder` is NOT set because checkpoint is not pre-extracted. Subsequent operations may fail."
)
mcore_model = MegatronGPTModel(model_cfg, trainer=trainer)
return mcore_model
@@ -163,22 +168,36 @@

def restore_model(nemo_file, cpu_only=False):
dummy_trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy())
- if cpu_only:
- map_location = torch.device('cpu')
- model_config = MegatronGPTModel.restore_from(
- nemo_file, trainer=dummy_trainer, return_config=True, map_location=map_location
+ map_location = torch.device('cpu') if cpu_only else None
+ model_config = MegatronGPTModel.restore_from(
+ nemo_file, trainer=dummy_trainer, return_config=True, map_location=map_location
)
+ model_config.use_cpu_initialization = cpu_only

+ # To copy weights in the original precision, we have to turn on O2.
+ orig_megatron_amp_O2_value = model_config.megatron_amp_O2
+ if model_config.target.endswith("MegatronGPTSFTModel"):
+ logging.warning(
+ "⚠️ Model target is `MegatronGPTSFTModel` which may not work with this conversion script. "
+ "This is a known issue. For now, please modify the config yaml file to use `MegatronGPTModel`."
)
- model_config.use_cpu_initialization = True
- else:
- model_config, map_location = None, None
- return MegatronGPTModel.restore_from(

+ if model_config.precision in ['bf16', 'bf16-mixed']:
+ model_config.megatron_amp_O2 = True

+ model = MegatronGPTModel.restore_from(
nemo_file, trainer=dummy_trainer, override_config_path=model_config, map_location=map_location
)

+ # restore O2 to the original value so mcore model has the same config
+ model.cfg.megatron_amp_O2 = orig_megatron_amp_O2_value
+ return model
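The added O2 handling follows a save / force / restore pattern: remember the user's megatron_amp_O2 setting, force it on while a bf16 checkpoint is being restored so the weights are copied in their original precision, then write the remembered value back onto the restored model's config. A minimal sketch of the same pattern in isolation (the Cfg class below is a made-up stand-in, not a NeMo type):

    from dataclasses import dataclass

    @dataclass
    class Cfg:  # illustrative stand-in for the restored model config
        precision: str = 'bf16'
        megatron_amp_O2: bool = False

    cfg = Cfg()
    orig_O2 = cfg.megatron_amp_O2              # remember the original setting
    if cfg.precision in ('bf16', 'bf16-mixed'):
        cfg.megatron_amp_O2 = True             # force O2 only while restoring
    # ... restore the model with `cfg` here ...
    cfg.megatron_amp_O2 = orig_O2              # put the original value back afterwards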


def convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_only=False):
if skip_if_output_exists and os.path.exists(output_nemo_file):
logging.info(f"Output file already exists ({output_nemo_file}), skipping conversion...")
logging.info("If you want to overwrite the output file, please run with --overwrite flag")
return
nemo_model = restore_model(input_nemo_file, cpu_only=cpu_only)

@@ -193,6 +212,8 @@
mcore_state_dict[mcore_param] = torch.cat(
[nemo_state_dict[nemo_param], nemo_state_dict[second_param]], dim=0
)
+ else:
+ mcore_state_dict[mcore_param] = nemo_state_dict[nemo_param]
else:
mcore_state_dict[mcore_param] = nemo_state_dict[nemo_param]
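The concat branch above reflects that mcore's fused linear_fc1.weight corresponds to the two legacy projections dense_h_to_4h.weight and dense_h_to_4h_2.weight stacked along dim 0, while the added else branch falls back to a direct copy for parameters that do not need the concat. A toy illustration with made-up shapes (the real tensors come from the checkpoint state dicts):

    import torch

    hidden, ffn = 4, 8  # made-up sizes for illustration only
    dense_h_to_4h = torch.randn(ffn, hidden)    # stands in for dense_h_to_4h.weight
    dense_h_to_4h_2 = torch.randn(ffn, hidden)  # stands in for dense_h_to_4h_2.weight

    # the fused mcore weight is simply the two projections stacked along dim 0
    linear_fc1 = torch.cat([dense_h_to_4h, dense_h_to_4h_2], dim=0)
    assert linear_fc1.shape == (2 * ffn, hidden)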

@@ -205,7 +226,9 @@

mcore_model.cfg.use_cpu_initialization = False
mcore_model.save_to(output_nemo_file)
logging.info(f"Done. Model saved to {output_nemo_file}")
logging.info(f"✅ Done. Model saved to {output_nemo_file}")
del mcore_model
del nemo_model


def run_sanity_checks(nemo_file, mcore_file, cpu_only=False):
@@ -239,7 +262,8 @@
# linear_fc1.weight should map to concat(dense_h_to_4h.weight, dense_h_to_4h_2.weight)
# but build_key_mapping only maps it to dense_h_to_4h.weight, so we handle the concat here.
second_param = nemo_param.replace("dense_h_to_4h.weight", "dense_h_to_4h_2.weight")
- nemo_weight = torch.cat([nemo_weight, nemo_state_dict.pop(second_param)])
+ if second_param in nemo_state_dict:
+ nemo_weight = torch.cat([nemo_weight, nemo_state_dict.pop(second_param)])
assert torch.allclose(mcore_weight, nemo_weight), f"❌ parameter {mcore_param} does not match"
except KeyError:
buffers = [k for k, v in mcore_model.named_buffers()]
@@ -261,11 +285,21 @@
if __name__ == '__main__':
args = get_args()

- input_nemo_file = args.in_file
+ input_nemo_file = args.in_folder
output_nemo_file = args.out_file
cpu_only = args.cpu_only
+ overwrite = args.overwrite

os.makedirs(os.path.dirname(output_nemo_file), exist_ok=True)
- convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_only=cpu_only)
+ try:
+ convert(input_nemo_file, output_nemo_file, skip_if_output_exists=not overwrite, cpu_only=cpu_only)
+ except torch.cuda.OutOfMemoryError:
+ logging.error("Could not convert due to torch.cuda.OutOfMemoryError.")
+ logging.error("Please run the script with --cpu-only flag")
+ exit(1)
torch.cuda.empty_cache()
- run_sanity_checks(input_nemo_file, output_nemo_file, cpu_only=cpu_only)
+ try:
+ run_sanity_checks(input_nemo_file, output_nemo_file, cpu_only=cpu_only)
+ except torch.cuda.OutOfMemoryError:
+ logging.info("✅ Conversion was successful, but could not run sanity check due to torch.cuda.OutOfMemoryError.")
+ logging.info("Please run the script with the same command again to run sanity check.")