diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 11c122f839b10..3dcc242803752 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -109,7 +109,7 @@ Decoder-only Language Models
     -
   * - :code:`MiniCPM3ForCausalLM`
    - MiniCPM3
-    - :code:`openbmb/MiniCPM3-4B`
+    - :code:`openbmb/MiniCPM3-4B`, etc.
     -
   * - :code:`MistralForCausalLM`
    - Mistral, Mistral-Instruct
diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py
index c5c10eeecdea1..5168210f15013 100644
--- a/tests/models/decoder_only/language/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -5,7 +5,8 @@
 Run `pytest tests/models/test_big_models.py`.
 """
 import pytest
-import torch
+
+from vllm.platforms import current_platform
 
 from ...utils import check_outputs_equal
 
@@ -21,9 +22,7 @@
 ]
 
 #TODO: remove this after CPU float16 support ready
-target_dtype = "float"
-if torch.cuda.is_available():
-    target_dtype = "half"
+target_dtype = "float" if current_platform.is_cpu() else "half"
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -37,6 +36,9 @@ def test_models(
     dtype: str,
     max_tokens: int,
 ) -> None:
+    if model.startswith("openbmb/MiniCPM3") and current_platform.is_cpu():
+        pytest.skip("MiniCPM requires fused_moe which is not supported by CPU")
+
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
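
For reference, here is a minimal, self-contained sketch of the platform-gated pattern the test change adopts. `current_platform.is_cpu()`, the dtype fallback, and the skip reason are taken directly from the diff above; the standalone test name and the single-model parametrize list are illustrative assumptions, not part of the actual test file.

```python
# Sketch of the platform-gated test pattern (not the real test_big_models.py).
import pytest

from vllm.platforms import current_platform

# The CPU backend has no float16 support yet, so fall back to float32 there.
target_dtype = "float" if current_platform.is_cpu() else "half"


@pytest.mark.parametrize("model", ["openbmb/MiniCPM3-4B"])  # illustrative list
def test_platform_gating(model: str) -> None:
    # MiniCPM3 depends on the fused_moe kernel, which the CPU backend lacks,
    # so the test is skipped rather than failed on CPU-only platforms.
    if model.startswith("openbmb/MiniCPM3") and current_platform.is_cpu():
        pytest.skip("MiniCPM requires fused_moe which is not supported by CPU")
    assert target_dtype in ("float", "half")
```

Replacing `torch.cuda.is_available()` with `current_platform.is_cpu()` keeps the dtype selection correct on non-CUDA accelerators as well, since the check now asks "is this the CPU backend?" rather than "is CUDA present?".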