Skip to content

Commit

Permalink
[Model] support minicpm3 (vllm-project#8297)
Browse files Browse the repository at this point in the history
Co-authored-by: DarkLight1337 <[email protected]>
  • Loading branch information
2 people authored and siddharth9820 committed Sep 30, 2024
1 parent acdcd8b commit 0da711f
Show file tree
Hide file tree
Showing 7 changed files with 281 additions and 37 deletions.
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \
Expand Down
4 changes: 4 additions & 0 deletions docs/source/models/supported_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ Decoder-only Language Models
- MiniCPM
- :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
-
* - :code:`MiniCPM3ForCausalLM`
- MiniCPM3
- :code:`openbmb/MiniCPM3-4B`, etc.
-
* - :code:`MistralForCausalLM`
- Mistral, Mistral-Instruct
- :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc.
Expand Down
1 change: 1 addition & 0 deletions requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
datamodel_code_generator # required for minicpm3 test

# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
Expand Down
15 changes: 9 additions & 6 deletions tests/models/decoder_only/language/test_big_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
Run `pytest tests/models/decoder_only/language/test_big_models.py`.
"""
import pytest
import torch

from vllm.platforms import current_platform

from ...utils import check_outputs_equal

Expand All @@ -19,10 +20,12 @@
# "Qwen/Qwen1.5-0.5B" # Broken,
]

if not current_platform.is_cpu():
# MiniCPM requires fused_moe, which is not supported on CPU
MODELS.append("openbmb/MiniCPM3-4B")

# TODO: remove this once CPU float16 support is ready
target_dtype = "float"
if torch.cuda.is_available():
target_dtype = "half"
target_dtype = "float" if current_platform.is_cpu() else "half"


@pytest.mark.parametrize("model", MODELS)
Expand All @@ -39,7 +42,7 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

check_outputs_equal(
Expand All @@ -57,7 +60,7 @@ def test_model_print(
model: str,
dtype: str,
) -> None:
with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
"MptForCausalLM": ("mpt", "MPTForCausalLM"),
"MPTForCausalLM": ("mpt", "MPTForCausalLM"),
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
"MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
"NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
"OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
"OPTForCausalLM": ("opt", "OPTForCausalLM"),
Expand Down
79 changes: 49 additions & 30 deletions vllm/model_executor/models/minicpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,38 +270,47 @@ def __init__(
) -> None:
super().__init__()
self.config = config
self.cache_config = cache_config
self.quant_config = quant_config
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings",
8192)
self.rope_theta = getattr(config, "rope_theta", 10000)
self.rope_scaling = getattr(config, "rope_scaling", None)
self.max_position_embeddings = getattr(config,
"max_position_embeddings", 8192)
self._init_attn_block()
self._init_ffn_block()

def _init_attn_block(self):
self.input_layernorm = RMSNorm(self.config.hidden_size,
eps=self.config.rms_norm_eps)
self.self_attn = MiniCPMAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
num_heads=self.config.num_attention_heads,
num_kv_heads=self.config.num_key_value_heads,
rope_theta=self.rope_theta,
rope_scaling=self.rope_scaling,
max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config,
quant_config=self.quant_config,
)

def _init_ffn_block(self):
self.post_attention_layernorm = RMSNorm(self.config.hidden_size,
eps=self.config.rms_norm_eps)
self.num_experts = getattr(self.config, "num_experts", 0)
if self.num_experts == 0:
self.mlp = MiniCPMMLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
quant_config=quant_config,
intermediate_size=self.config.intermediate_size,
hidden_act=self.config.hidden_act,
quant_config=self.quant_config,
)
else:
self.mlp = MiniCPMMoE(num_experts=config.num_experts,
top_k=config.num_experts_per_tok,
hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size)
self.input_layernorm = RMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
self.post_attention_layernorm = RMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
self.mlp = MiniCPMMoE(
num_experts=self.config.num_experts,
top_k=self.config.num_experts_per_tok,
hidden_size=self.config.hidden_size,
intermediate_size=self.config.intermediate_size)

def forward(
self,
Expand Down Expand Up @@ -344,6 +353,8 @@ def __init__(
) -> None:
super().__init__()
self.config = config
self.cache_config = cache_config
self.quant_config = quant_config
self.padding_idx = config.pad_token_id
lora_vocab = (lora_config.lora_extra_vocab_size *
(lora_config.max_loras or 1)) if lora_config else 0
Expand All @@ -354,11 +365,15 @@ def __init__(
config.hidden_size,
org_num_embeddings=config.vocab_size,
)
self._init_layers()
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

def _init_layers(self):
self.layers = nn.ModuleList([
MiniCPMDecoderLayer(config, cache_config, quant_config)
for _ in range(config.num_hidden_layers)
MiniCPMDecoderLayer(self.config, self.cache_config,
self.quant_config)
for _ in range(self.config.num_hidden_layers)
])
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
embedding = self.embed_tokens(input_ids)
Expand Down Expand Up @@ -431,13 +446,11 @@ def __init__(

self.config = config
self.lora_config = lora_config
self.cache_config = cache_config
self.quant_config = quant_config

self.num_experts = getattr(self.config, "num_experts", 0)
self.quant_config = quant_config
self.model = MiniCPMModel(config,
cache_config,
quant_config,
lora_config=lora_config)
self._init_model()
unpadded_vocab_size = config.vocab_size
if lora_config:
unpadded_vocab_size += lora_config.lora_extra_vocab_size
Expand All @@ -458,6 +471,12 @@ def __init__(
config.vocab_size)
self.sampler = Sampler()

def _init_model(self):
self.model = MiniCPMModel(config=self.config,
cache_config=self.cache_config,
quant_config=self.quant_config,
lora_config=self.lora_config)

def forward(
self,
input_ids: torch.Tensor,
Expand Down
Loading

0 comments on commit 0da711f

Please sign in to comment.