Enable HF NLLB conversion #204

Open · wants to merge 4 commits into main

151 changes: 113 additions & 38 deletions eole/bin/convert/convert_HF.py
@@ -24,12 +24,13 @@
from eole.config import recursive_model_fields_set
from eole.config.models import (
TransformerEncoderModelConfig,
TransformerModelConfig,
TransformerLMModelConfig,
VisionTransformerLMModelConfig,
)
from eole.config.run import TrainConfig
from eole.config.training import TrainingConfig
from eole.constants import DefaultTokens, TORCH_DTYPES
from eole.constants import DefaultTokens, TORCH_DTYPES, PositionEncodingType
from eole.inputters.inputter import vocabs_to_dict


@@ -150,6 +151,44 @@
"adapter.w_out.weight": "multi_modal_projector.linear_2.weight",
"adapter.w_out.bias": "multi_modal_projector.linear_2.bias",
},
"M2M100ForConditionalGeneration": {
"encoder_layer_prefix": "model.encoder.layers.",
"decoder_layer_prefix": "model.decoder.layers.",
"src_emb.embeddings.weight": "model.encoder.embed_tokens.weight",
"tgt_emb.embeddings.weight": "model.decoder.embed_tokens.weight",
"decoder.layer_norm.weight": "model.decoder.layer_norm.weight",
"decoder.layer_norm.bias": "model.decoder.layer_norm.bias",
"encoder.layer_norm.weight": "model.encoder.layer_norm.weight",
"encoder.layer_norm.bias": "model.encoder.layer_norm.bias",
".self_attn.linear_query.": ".self_attn.q_proj.",
".self_attn.linear_keys.": ".self_attn.k_proj.",
".self_attn.linear_values.": ".self_attn.v_proj.",
".self_attn.final_linear.": ".self_attn.out_proj.",
".precontext_layernorm.weight": ".encoder_attn_layer_norm.weight",
".precontext_layernorm.bias": ".encoder_attn_layer_norm.bias",
".context_attn.linear_query.": ".encoder_attn.q_proj.",
".context_attn.linear_keys.": ".encoder_attn.k_proj.",
".context_attn.linear_values.": ".encoder_attn.v_proj.",
".context_attn.final_linear.": ".encoder_attn.out_proj.",
".mlp.gate_up_proj.": ".fc1.",
".mlp.down_proj.": ".fc2.",
".input_layernorm.weight": ".self_attn_layer_norm.weight",
".input_layernorm.bias": ".self_attn_layer_norm.bias",
".post_attention_layernorm.weight": ".final_layer_norm.weight",
".post_attention_layernorm.bias": ".final_layer_norm.bias",
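# Encoder-layer key mappings: the same per-layer renames as above, minus the
# cross-attention ("context_attn" / "precontext_layernorm") entries, which only exist in the decoder.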
"encoder": {
".self_attn.linear_query.": ".self_attn.q_proj.",
".self_attn.linear_keys.": ".self_attn.k_proj.",
".self_attn.linear_values.": ".self_attn.v_proj.",
".self_attn.final_linear.": ".self_attn.out_proj.",
".mlp.gate_up_proj.": ".fc1.",
".mlp.down_proj.": ".fc2.",
".input_layernorm.weight": ".self_attn_layer_norm.weight",
".input_layernorm.bias": ".self_attn_layer_norm.bias",
".post_attention_layernorm.weight": ".final_layer_norm.weight",
".post_attention_layernorm.bias": ".final_layer_norm.bias",
},
},
}

# Combine base mappings with overrides
@@ -163,6 +202,7 @@
"GPT2LMHeadModel": "standard",
"XLMRobertaXLForMaskedLM": "standard",
"Gemma2ForCausalLM": "gemma-rms",
"M2M100ForConditionalGeneration": "standard",
},
)

@@ -174,6 +214,14 @@
"GPT2LMHeadModel": "gelu",
"XLMRobertaXLForMaskedLM": "gelu",
"Gemma2ForCausalLM": "gated-gelu",
"M2M100ForConditionalGeneration": "relu",
},
)

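# Position-encoding type per HF architecture: unlisted architectures fall back to
# rotary embeddings, while NLLB (M2M100) keeps its sinusoidal (concatenated) embeddings.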
EMBED_TABLE = defaultdict(
lambda: PositionEncodingType.Rotary,
{
"M2M100ForConditionalGeneration": PositionEncodingType.SinusoidalConcat,
},
)

@@ -183,6 +231,7 @@
{
"XLMRobertaXLForMaskedLM": TransformerEncoderModelConfig,
"LlavaForConditionalGeneration": VisionTransformerLMModelConfig,
"M2M100ForConditionalGeneration": TransformerModelConfig,
},
)

@@ -426,11 +475,12 @@ def build_config_dict(hf):
# Initialize model_config with defaults and fallbacks
model_config = {
"layers": config.get("num_hidden_layers", config.get("n_layer", config.get("n_layers"))),
"hidden_size": config.get("hidden_size", config.get("n_embd", config.get("hidden_dim"))),
"hidden_size": config.get("hidden_size", config.get("n_embd", config.get("hidden_dim", config.get("d_model")))),
"heads": config.get(
"num_attention_heads", config.get("n_head", config.get("n_heads", 32))
"num_attention_heads",
config.get("n_head", config.get("n_heads", config.get("decoder_attention_heads", 32))),
), # default 32 patch for mistral-community/pixtral-12b
"transformer_ff": config.get("intermediate_size", config.get("hidden_size", config.get("n_embd")) * 4),
"transformer_ff": config.get("intermediate_size", config.get("decoder_ffn_dim", None)),
"mlp_activation_fn": ACT_TABLE[arch],
"layer_norm": LN_TABLE[arch],
"heads_kv": config.get("multi_query", False)
@@ -462,25 +512,46 @@ def build_config_dict(hf):
"embeddings": {}, # Populated later
}

# patch transformer_ff
if model_config["transformer_ff"] is None:
model_config["transformer_ff"] = model_config["hidden_size"] * 4

# patch sliding window
if model_config["sliding_window"] is None:
model_config["sliding_window"] = 4096

# Populate embeddings
model_config["embeddings"] = {
"src_word_vec_size": model_config["hidden_size"],
"tgt_word_vec_size": model_config["hidden_size"],
}

# Position encoding configuration
model_config["embeddings"].update(
{
"position_encoding_type": EMBED_TABLE[arch],
# "n_positions": 0,
}
)
# if "max_position_embeddings" in config.keys():
# model_config["embeddings"]["n_positions"] = config["max_position_embeddings"]

# patch rotary dim
if "rotary_dim" in config.keys():
model_config["rope_config"]["rotary_dim"] = config["rotary_dim"]
elif "partial_rotary_factor" in config.keys():
model_config["rope_config"]["rotary_dim"] = int(
config["partial_rotary_factor"] * (model_config["hidden_size"] // model_config["heads"])
)
elif model_config.get("head_dim", None) is not None:
model_config["rope_config"]["rotary_dim"] = model_config["head_dim"]
else:
model_config["rope_config"]["rotary_dim"] = model_config["hidden_size"] // model_config["heads"]
if EMBED_TABLE[arch] == PositionEncodingType.Rotary:
if "rotary_dim" in config.keys():
model_config["rope_config"]["rotary_dim"] = config["rotary_dim"]
elif "partial_rotary_factor" in config.keys():
model_config["rope_config"]["rotary_dim"] = int(
config["partial_rotary_factor"] * (model_config["hidden_size"] // model_config["heads"])
)
elif model_config.get("head_dim", None) is not None:
model_config["rope_config"]["rotary_dim"] = model_config["head_dim"]
else:
model_config["rope_config"]["rotary_dim"] = model_config["hidden_size"] // model_config["heads"]

# patch rotary theta
if "rotary_theta" in config.keys():
model_config["rope_config"]["rotary_theta"] = config["rotary_theta"]
# patch rotary theta
if "rotary_theta" in config.keys():
model_config["rope_config"]["rotary_theta"] = config["rotary_theta"]

# Validate required fields
required_fields = {
@@ -493,12 +564,6 @@ def build_config_dict(hf):
if model_config[key] is None:
raise ValueError(error_msg)

# Populate embeddings
model_config["embeddings"] = {
"src_word_vec_size": model_config["hidden_size"],
"tgt_word_vec_size": model_config["hidden_size"],
}

# Update rope scaling related settings
if config.get("rope_scaling", None) is not None:
model_config["rope_config"].update(
@@ -549,14 +614,6 @@ def build_config_dict(hf):

model_config["share_decoder_embeddings"] = config.get("tie_word_embeddings", False)

# Position encoding configuration
model_config["embeddings"].update(
{
"position_encoding_type": "Rotary",
"n_positions": 0,
}
)

# Define architecture-specific configurations
arch_configs = {
"PhiForCausalLM": {
@@ -600,6 +657,18 @@ def build_config_dict(hf):
"normalize": True,
},
},
"M2M100ForConditionalGeneration": {
"parallel_residual": False,
"add_qkvbias": True,
"add_final_linear_bias": True,
"add_ffnbias": True,
"embeddings": {
"position_encoding_type": "SinusoidalConcat",
"n_positions": 1024,
},
"left_pad": False,
"share_decoder_embeddings": True,
},
}

# Vision encoder
@@ -858,6 +927,7 @@ def build_first_shard(hf, eole_safetensor):
for module in [
"input_layernorm",
"layer_norm_res",
"precontext_layernorm",
"post_attention_layernorm",
"pre_feedforward_layernorm",
"post_feedforward_layernorm",
@@ -904,13 +974,16 @@

def check_sentencepiece_tokenizer(hf):
tokenizer_basename = os.path.basename(hf.tokenizer_model)
vocab = get_sentencepiece_vocab(hf.tokenizer_model)
if hf.tokenizer_json is not None:
# We need to add 'added_tokens' that are not in the SP model
newtokens = [tok["content"] for tok in hf.tokenizer["added_tokens"] if tok["content"] not in vocab]
vocab.extend(newtokens)
for tok in hf.tokenizer["added_tokens"]:
vocab[tok["id"]] = tok["content"]
vocab = list(hf.tokenizer["model"]["vocab"].keys())
else:
vocab = get_sentencepiece_vocab(hf.tokenizer_model)
if hf.tokenizer_json is not None:
# We need to add 'added_tokens' that are not in the SP model
newtokens = [tok["content"] for tok in hf.tokenizer["added_tokens"] if tok["content"] not in vocab]
vocab.extend(newtokens)
for tok in hf.tokenizer["added_tokens"]:
vocab[tok["id"]] = tok["content"]
src_vocab = pyonmttok.build_vocab_from_tokens(
vocab,
)
@@ -1068,7 +1141,9 @@ def run(cls, args):
},
}

if add_bos_token:
if hf.config.get("decoder_start_token_id", None) is not None:
vocabs["decoder_start_token"] = src_vocab.ids_to_tokens[hf.config["decoder_start_token_id"]]
elif add_bos_token:
vocabs["decoder_start_token"] = vocabs["specials"]["bos_token"]
else:
vocabs["decoder_start_token"] = ""
6 changes: 5 additions & 1 deletion eole/inputters/text_utils.py
@@ -74,6 +74,11 @@ def numericalize(vocabs, example, model_type=ModelType.ENCODER_DECODER):
numeric["src"]["src_ids"] = vocabs["src"](src_text)
if example.get("tgt", None) is not None:
if maybe_tgt_ids != []:
# TODO: handle this better in HF tokenizer templates
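# prepend the decoder start token if the tokenized target does not already begin with it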
if decoder_start_token != "":
decoder_start_token_id = vocabs["tgt"].tokens_to_ids[decoder_start_token]
if maybe_tgt_ids[0] != decoder_start_token_id:
maybe_tgt_ids = [decoder_start_token_id] + maybe_tgt_ids
numeric["tgt"]["tgt_ids"] = maybe_tgt_ids
else:
tgt_text = example["tgt"]["tgt"].split(" ")
@@ -89,7 +94,6 @@ def numericalize(vocabs, example, model_type=ModelType.ENCODER_DECODER):
numeric["src"]["src_ids"] = vocabs["src"](src_text)
if example["tgt"] is not None:
if maybe_tgt_ids != []:
# decoder_start_token logic is supposedly handled in the tokenizer
numeric["tgt"]["tgt_ids"] = maybe_tgt_ids
else:
tgt_text = example["tgt"]["tgt"].split(" ")
35 changes: 35 additions & 0 deletions recipes/nllb/README.md
@@ -0,0 +1,35 @@
# NLLB

## Conversion

### 1. SentencePiece with OpenNMT Tokenizer

```bash
eole convert HF --model_dir facebook/nllb-200-1.3B --output ./nllb-1.3b --token $HF_TOKEN --tokenizer onmt
```

### 2. HuggingFace Tokenizer

```bash
eole convert HF --model_dir facebook/nllb-200-1.3B --output ./nllb-1.3b --token $HF_TOKEN
```


## Inference

Create a small test input:

```bash
echo "What is the weather like in Tahiti?" > test.en
```


### 1. SentencePiece with OpenNMT Tokenizer

```bash
eole predict -c inference-pyonmttok.yaml
```

### 2. HuggingFace Tokenizer

```bash
eole predict -c inference-hf.yaml
```
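
A quick end-to-end sanity check of the recipe (a sketch assuming the HF-tokenizer path, run from this recipe directory, with `nllb-1.3b` produced by the conversion step above; the translation itself will vary):

```bash
eole convert HF --model_dir facebook/nllb-200-1.3B --output ./nllb-1.3b --token $HF_TOKEN
echo "What is the weather like in Tahiti?" > test.en
eole predict -c inference-hf.yaml
cat test.de
```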
17 changes: 17 additions & 0 deletions recipes/nllb/inference-hf.yaml
@@ -0,0 +1,17 @@
model_path: "nllb-1.3b"
transforms: ["prefix", "huggingface_tokenize"]
transforms_configs:
prefix:
src_prefix: "</s> eng_Latn"
tgt_prefix: "deu_Latn"
huggingface_tokenize:
huggingface_model: facebook/nllb-200-1.3B

tgt_file_prefix: true

gpu_ranks: [0]
world_size: 1
beam_size: 5

src: test.en
output: test.de
21 changes: 21 additions & 0 deletions recipes/nllb/inference-pyonmttok.yaml
@@ -0,0 +1,21 @@
model_path: "nllb-1.3b"
# transforms: ["sentencepiece", "prefix"]
transforms: ["prefix", "huggingface_tokenize"]
transforms_configs:
prefix:
src_prefix: "</s> eng_Latn"
tgt_prefix: "deu_Latn"
# sentencepiece:
# src_subword_model: ./flores200_sacrebleu_tokenizer_spm.model
# tgt_subword_model: ./flores200_sacrebleu_tokenizer_spm.model
huggingface_tokenize:
huggingface_model: facebook/nllb-200-1.3B

tgt_file_prefix: true

gpu_ranks: [0]
world_size: 1
beam_size: 5

src: test.en
output: test.de
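
The active transforms in this config still use the HuggingFace tokenizer; the SentencePiece lines are left commented out. A possible SentencePiece variant, assuming the `flores200_sacrebleu_tokenizer_spm.model` file is obtained separately and placed next to the config, could look like:

```yaml
model_path: "nllb-1.3b"
transforms: ["sentencepiece", "prefix"]
transforms_configs:
  prefix:
    src_prefix: "</s> eng_Latn"
    tgt_prefix: "deu_Latn"
  sentencepiece:
    src_subword_model: ./flores200_sacrebleu_tokenizer_spm.model
    tgt_subword_model: ./flores200_sacrebleu_tokenizer_spm.model

tgt_file_prefix: true

gpu_ranks: [0]
world_size: 1
beam_size: 5

src: test.en
output: test.de
```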