Commit
Add nemo to mcore GPT conversion script (NVIDIA#7730)
* add conversion script
* remove references to 'ckpt'
* add one more sanity check to make sure there are no unexpected keys in the state dict
* make cpu loading work
* make script work for llama2 models
* address code check
* remove trainer precision (was for old sanity check)
* fix script for llama2 model
* remove commented code
* [pre-commit.ci] auto fixes from pre-commit.com hooks (see https://pre-commit.ci)

Signed-off-by: Chen Cui <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Eric Harper <[email protected]>
1 parent 504097c · commit 214dcca
Showing 1 changed file with 271 additions and 0 deletions.
scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py
@@ -0,0 +1,271 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from argparse import ArgumentParser
from collections import OrderedDict

import torch
from omegaconf import OmegaConf, open_dict
from pytorch_lightning.trainer.trainer import Trainer

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
from nemo.utils import AppState, logging

r""" | ||
Script to convert a legacy (non-mcore path) nemo checkpoint into mcore-path checkpoint for GPT models. | ||
*Important* Before running this script, please first | ||
1) convert your legacy checkpoint to TP1 PP1 format: | ||
python examples/nlp/language_modeling/megatron_change_num_partitions.py \ | ||
<follow the readme in that script> \ | ||
--target_tensor_model_parallel_size=1 \ | ||
--target_pipeline_model_parallel_size=1 | ||
2) extract your nemo file to a folder with | ||
tar -xvf filename.nemo | ||
Then, run this conversion script: | ||
python convert_nemo_gpt_to_mcore.py \ | ||
--in-file <path to extracted, TP1 PP1 legacy checkpoint folder> \ | ||
--out-file <path to output nemo ile> | ||
""" | ||
|
||
|
||
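# A minimal post-conversion spot check, as a sketch (the output path below is a placeholder):
#
#     cfg = MegatronGPTModel.restore_from("/path/to/converted.nemo", return_config=True)
#     assert cfg.get("mcore_gpt", False), "converted checkpoint should carry mcore_gpt=True"
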
def get_args():
    parser = ArgumentParser()
    parser.add_argument(
        "--in-file", type=str, default=None, required=True, help="Path to extracted, TP1 PP1 NeMo GPT checkpoint.",
    )
    parser.add_argument(
        "--out-file", type=str, default=None, required=True, help="Path to output mcore weights file (ends in .nemo)."
    )
    parser.add_argument(
        "--cpu-only",
        action="store_true",
        help="Load the model on CPU only. Useful if the model cannot fit in GPU memory, "
        "but this option makes the conversion script significantly slower.",
    )
    args = parser.parse_args()
    return args


def get_mcore_model_from_nemo_file(nemo_restore_from_path, cpu_only=False):
    model_cfg = MegatronGPTModel.restore_from(nemo_restore_from_path, return_config=True)
    model_cfg.tokenizer.vocab_file = None
    model_cfg.tokenizer.merge_file = None
    model_cfg.mcore_gpt = True
    model_cfg.use_cpu_initialization = cpu_only

    logging.info("*** initializing mcore model with the following config")
    logging.info(OmegaConf.to_yaml(model_cfg))
    trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy())

    app_state = AppState()
    if os.path.isdir(nemo_restore_from_path):
        app_state.nemo_file_folder = nemo_restore_from_path
    else:
        logging.warning(
            "`nemo_file_folder` is NOT set because the checkpoint is not pre-extracted. Subsequent operations may fail."
        )
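    # Instantiating the model from this config yields a randomly initialized mcore GPT;
    # the converted weights are copied into it later (see load_model() in convert()).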
    mcore_model = MegatronGPTModel(model_cfg, trainer=trainer)
    return mcore_model


def print_mcore_parameter_names(restore_from_path):
    mcore_model = get_mcore_model_from_nemo_file(restore_from_path)

    print("*********")
    print('\n'.join(sorted([k + '###' + str(v.shape) for k, v in mcore_model.named_parameters()])))
    print("*********")


def build_key_mapping(nemo_cfg):
    num_layers = nemo_cfg.num_layers
    has_bias = nemo_cfg.get("bias", True)
    has_layernorm_bias = (
        nemo_cfg.get("normalization", "layernorm") != "rmsnorm"
    )  # llama model uses rmsnorm which does not have bias
    model_str = 'model.module' if nemo_cfg.get('megatron_amp_O2', False) else 'model'

    # For GPT there is a 1:1 mapping of keys
    mcore_to_nemo_mapping = {
        f"{model_str}.embedding.word_embeddings.weight": "model.language_model.embedding.word_embeddings.weight",
        f"{model_str}.decoder.final_layernorm.weight": "model.language_model.encoder.final_layernorm.weight",
    }
    if has_layernorm_bias:
        mcore_to_nemo_mapping[
            f"{model_str}.decoder.final_layernorm.bias"
        ] = "model.language_model.encoder.final_layernorm.bias"

    if not nemo_cfg.get("share_embeddings_and_output_weights", True):
        mcore_to_nemo_mapping[f"{model_str}.output_layer.weight"] = "model.language_model.output_layer.weight"

    if nemo_cfg.get("position_embedding_type", 'learned_absolute') == 'rope':
        mcore_to_nemo_mapping[f"{model_str}.rotary_pos_emb.inv_freq"] = "model.language_model.rotary_pos_emb.inv_freq"
    else:
        mcore_to_nemo_mapping[
            f"{model_str}.embedding.position_embeddings.weight"
        ] = "model.language_model.embedding.position_embeddings.weight"

    nemo_prefix = "model.language_model.encoder.layers"
    mcore_prefix = f"{model_str}.decoder.layers"
    for i in range(num_layers):
        for wb in ('weight', 'bias') if has_bias else ('weight',):
            mcore_to_nemo_mapping.update(
                {
                    f"{mcore_prefix}.{i}.mlp.linear_fc2.{wb}": f"{nemo_prefix}.{i}.mlp.dense_4h_to_h.{wb}",
                    f"{mcore_prefix}.{i}.mlp.linear_fc1.{wb}": f"{nemo_prefix}.{i}.mlp.dense_h_to_4h.{wb}",
                    f"{mcore_prefix}.{i}.self_attention.linear_proj.{wb}": f"{nemo_prefix}.{i}.self_attention.dense.{wb}",
                    f"{mcore_prefix}.{i}.self_attention.linear_qkv.{wb}": f"{nemo_prefix}.{i}.self_attention.query_key_value.{wb}",
                }
            )
        # layernorm layers always have bias, but llama model uses rmsnorm which does not have bias
        for wb in ('weight', 'bias') if has_layernorm_bias else ('weight',):
            mcore_to_nemo_mapping.update(
                {
                    f"{mcore_prefix}.{i}.self_attention.linear_qkv.layer_norm_{wb}": f"{nemo_prefix}.{i}.input_layernorm.{wb}",
                    f"{mcore_prefix}.{i}.mlp.linear_fc1.layer_norm_{wb}": f"{nemo_prefix}.{i}.post_attention_layernorm.{wb}",
                }
            )

    return mcore_to_nemo_mapping


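# Illustration: for a hypothetical config with megatron_amp_O2=False, layer 0 of the mapping
# returned by build_key_mapping() above contains entries such as
#   model.decoder.layers.0.self_attention.linear_qkv.weight
#       -> model.language_model.encoder.layers.0.self_attention.query_key_value.weight
#   model.decoder.layers.0.mlp.linear_fc1.layer_norm_weight
#       -> model.language_model.encoder.layers.0.post_attention_layernorm.weight

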
def load_model(model, state_dict):
    for name, module in model.named_parameters():
        if name in state_dict:
            module.data = state_dict.pop(name)
        else:
            raise RuntimeError(f"Unexpected key: {name} not in state_dict but in model.")

    for name, buffer in model.named_buffers():
        if name in state_dict:
            buffer.data = state_dict.pop(name)

    if len(state_dict.keys()) != 0:
        raise RuntimeError(f"Additional keys: {state_dict.keys()} in state_dict but not in model.")

    return model


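# restore_model() below restores a checkpoint with a throwaway CPU trainer. With cpu_only=True,
# the config is read first (return_config=True) so that use_cpu_initialization can be forced on
# and the weights can be mapped to CPU before the full model is materialized.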
def restore_model(nemo_file, cpu_only=False):
    dummy_trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy())
    if cpu_only:
        map_location = torch.device('cpu')
        model_config = MegatronGPTModel.restore_from(
            nemo_file, trainer=dummy_trainer, return_config=True, map_location=map_location
        )
        model_config.use_cpu_initialization = True
    else:
        model_config, map_location = None, None
    return MegatronGPTModel.restore_from(
        nemo_file, trainer=dummy_trainer, override_config_path=model_config, map_location=map_location
    )


def convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_only=False):
    if skip_if_output_exists and os.path.exists(output_nemo_file):
        logging.info(f"Output file already exists ({output_nemo_file}), skipping conversion...")
        return
    nemo_model = restore_model(input_nemo_file, cpu_only=cpu_only)

    nemo_tokenizer_model = nemo_model.cfg.tokenizer.model
    nemo_state_dict = nemo_model.state_dict()
    mcore_state_dict = OrderedDict()
    for mcore_param, nemo_param in build_key_mapping(nemo_model.cfg).items():
        if mcore_param.endswith("linear_fc1.weight"):
            # in llama models, need to concat dense_h_to_4h.weight and dense_h_to_4h_2.weight
            # for the corresponding linear_fc1.weight
            second_param = nemo_param.replace("dense_h_to_4h.weight", "dense_h_to_4h_2.weight")
            if second_param in nemo_state_dict:
                mcore_state_dict[mcore_param] = torch.cat(
                    [nemo_state_dict[nemo_param], nemo_state_dict[second_param]], dim=0
                )
            else:
                # non-gated MLPs have no second weight to concatenate, so copy directly
                mcore_state_dict[mcore_param] = nemo_state_dict[nemo_param]
        else:
            mcore_state_dict[mcore_param] = nemo_state_dict[nemo_param]

    mcore_model = get_mcore_model_from_nemo_file(input_nemo_file, cpu_only=cpu_only)
    mcore_model = load_model(mcore_model, mcore_state_dict)

    if nemo_model.cfg.tokenizer.model is not None:
        logging.info("registering artifact: tokenizer.model = " + nemo_tokenizer_model)
        mcore_model.register_artifact("tokenizer.model", nemo_tokenizer_model)

    mcore_model.cfg.use_cpu_initialization = False
    mcore_model.save_to(output_nemo_file)
    logging.info(f"Done. Model saved to {output_nemo_file}")


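# Shape illustration for the gated-MLP concat in convert() above, using Llama-2-7B-like sizes
# (hidden_size 4096, ffn_hidden_size 11008) purely as an example:
#   dense_h_to_4h.weight   : [11008, 4096]
#   dense_h_to_4h_2.weight : [11008, 4096]
#   torch.cat([...], dim=0) gives linear_fc1.weight : [22016, 4096]

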
def run_sanity_checks(nemo_file, mcore_file, cpu_only=False):
    nemo_model = restore_model(nemo_file, cpu_only=cpu_only).eval()
    mcore_model = restore_model(mcore_file, cpu_only=cpu_only).eval()

    logging.debug("*** Mcore model restored config")
    logging.debug(OmegaConf.to_yaml(mcore_model.cfg))

    nemo_summary = nemo_model.summarize()
    mcore_summary = mcore_model.summarize()

    logging.info("Sanity checks:")

    # check that the number of weights matches
    assert nemo_summary.total_parameters == mcore_summary.total_parameters, "❌ total parameters do not match"
    assert nemo_summary.model_size == mcore_summary.model_size, "❌ model sizes do not match"
    logging.info("✅ Number of weights match")

    # check that the weights match
    mcore_state_dict = mcore_model.state_dict()
    nemo_state_dict = nemo_model.state_dict()
    with open_dict(nemo_model.cfg):
        nemo_model.cfg.megatron_amp_O2 = False  # we want build_key_mapping in the next line to not use the O2 prefix
    for mcore_param, nemo_param in build_key_mapping(nemo_model.cfg).items():
        try:
            mcore_weight = mcore_state_dict.pop(mcore_param)
            nemo_weight = nemo_state_dict.pop(nemo_param)
            if mcore_param.endswith("linear_fc1.weight"):
                # linear_fc1.weight should map to concat(dense_h_to_4h.weight, dense_h_to_4h_2.weight),
                # but build_key_mapping only maps it to dense_h_to_4h.weight, so we handle the concat here.
                second_param = nemo_param.replace("dense_h_to_4h.weight", "dense_h_to_4h_2.weight")
                if second_param in nemo_state_dict:
                    nemo_weight = torch.cat([nemo_weight, nemo_state_dict.pop(second_param)])
            assert torch.allclose(mcore_weight, nemo_weight), f"❌ parameter {mcore_param} does not match"
        except KeyError:
            buffers = [k for k, v in mcore_model.named_buffers()]
            assert (
                mcore_param in buffers or mcore_param.replace('model.', 'model.module.', 1) in buffers
            ), f"❌ parameter {mcore_param} is not found in the state dict or named_buffers()"
            nemo_state_dict.pop(nemo_param)

    logging.info("✅ Weights match")

    # check for unexpected weights in state dict
    assert len(nemo_state_dict) == 0, f"❌ unexpected items in nemo_state_dict: {nemo_state_dict}"
    assert (
        len([k for k in mcore_state_dict if not k.endswith('_extra_state')]) == 0
    ), f"❌ unexpected items in mcore_state_dict: {mcore_state_dict}"
    logging.info("✅ No unexpected weights in state dicts")


if __name__ == '__main__':
    args = get_args()

    input_nemo_file = args.in_file
    output_nemo_file = args.out_file
    cpu_only = args.cpu_only

    os.makedirs(os.path.dirname(output_nemo_file), exist_ok=True)
    convert(input_nemo_file, output_nemo_file, skip_if_output_exists=True, cpu_only=cpu_only)
    torch.cuda.empty_cache()
    run_sanity_checks(input_nemo_file, output_nemo_file, cpu_only=cpu_only)