[Model]: Add transformers backend support #11330

Merged (+528 −9)

Changes from 55 commits (105 commits in total)

Commits:
0bb5519 Merge (ArthurZucker)
8e238f7 Merge branch 'main' into transformers-backend (ArthurZucker)
6d8f1fd revert some changes (ArthurZucker)
fb37617 changes are now merged with main of transformers (ArthurZucker)
2d0c128 revert more changes (ArthurZucker)
31c16a1 Merge remote-tracking branch 'upstream/main' into fix-history (hmellor)
a49aa81 Undo whitespace changes (hmellor)
ae2e1cf Merge remote-tracking branch 'upstream/main' into fix-history (hmellor)
ff19ade Update transformers pin (hmellor)
038604b Remove unreachable code (hmellor)
882ef81 Remove dead code (hmellor)
f254f2c Update to latest attention interface (hmellor)
5a1a833 Always try to load `TransformersModel` if model isn't explicitly supp… (hmellor)
b7de34d Temporarily remove Llama from registry (hmellor)
49c4616 Deduplicate registry code slightly (hmellor)
071246d Fix profiling of Attentions (hmellor)
6190591 Run `./format.sh` on `transformers.py` (hmellor)
7ae8262 Fix spelling (hmellor)
988586d Undo changes to `chat.py` (hmellor)
5313551 tests + md (ArthurZucker)
f127a03 test helium (ArthurZucker)
9baefd2 fix dtype issue (ArthurZucker)
aff205a Make model implementation configurable (hmellor)
4efcac8 FIx previous commit (hmellor)
5d3afac `format.sh` (hmellor)
20f4d48 Handle alternative vocab embed layer names (hmellor)
013f880 Undo removel of `LlamaForCausalLM` (hmellor)
19dc1f8 Add `RMSNorm` replacement (hmellor)
7b5f146 bnb and `SupportsLoRA` (ArthurZucker)
e1d1e33 Merge branch 'fix-history' of github.com:ArthurZucker/vllm into fix-… (ArthurZucker)
c805f9d Change log (hmellor)
aadfb1b Formatting (hmellor)
544ba2d Disable vLLM RMS Norm implementation for now (hmellor)
06347f8 Only throw TP error if user is trying to use TP (hmellor)
3fe40d1 Add some tests for TransformersModel (hmellor)
d37fd9b remove replace norm, cleanup (ArthurZucker)
86dc357 Merge branch 'fix-history' of github.com:ArthurZucker/vllm into fix-h… (ArthurZucker)
4cbea32 linting and test mark (Isotr0py)
96f0a3a revert example modification (Isotr0py)
91e6037 fix wrong llm.model (Isotr0py)
754124a Merge remote-tracking branch 'upstream/main' into fix-history (Isotr0py)
554df59 use apply_model (Isotr0py)
319cf97 Update docs/source/models/supported_models.md (ArthurZucker)
88d679a Merge branch 'main' into fix-history (ArthurZucker)
d346637 Update docs/source/models/supported_models.md (ArthurZucker)
0f15f09 move the check to normalized arch (ArthurZucker)
f4c41eb Merge branch 'fix-history' of github.com:ArthurZucker/vllm into fix-h… (ArthurZucker)
2a4fc4f fix (ArthurZucker)
ceabb51 revert try inspect changes (ArthurZucker)
50b218a Update test (ArthurZucker)
c8aac87 style (ArthurZucker)
f6cb8fe Merge branch 'main' into fix-history (ArthurZucker)
1896af7 style update (ArthurZucker)
b42e464 Merge branch 'fix-history' of github.com:ArthurZucker/vllm into fix-h… (ArthurZucker)
1983511 Merge branch 'main' of https://github.com/vllm-project/vllm into fix-… (ArthurZucker)
869934a fix normalize arch (ArthurZucker)
df1c8b2 update test, fix gpu marker and remove trust remote as it's True by d… (ArthurZucker)
ffd6dce update test (ArthurZucker)
9704287 for now use `model_config.hf_config.auto_map["AutoModel"]` (ArthurZucker)
9a871af fix remote models (ArthurZucker)
4f33ff8 nits (ArthurZucker)
fc6a7e9 remove unused kwarg class (ArthurZucker)
0ab2f82 fix weight loading (ArthurZucker)
44f78ef fix test (ArthurZucker)
4847836 update test! (ArthurZucker)
0b348e4 Nits (ArthurZucker)
2132dcf update (ArthurZucker)
20bc901 remove print (ArthurZucker)
62540f2 update (ArthurZucker)
cfeaaae Fix fallback, dict keys != attrs (hmellor)
ecf2990 cleanup (ArthurZucker)
8cbd02e Merge branch 'fix-history' of github.com:ArthurZucker/vllm into fix-h… (ArthurZucker)
e30000d pre-commit (ArthurZucker)
7fd638f nit (ArthurZucker)
57c5dbf Merge remote-tracking branch 'origin/main' into fix-history (hmellor)
e714c05 pre-commit (hmellor)
fc62d7d Remove unused line (hmellor)
5475b5b Remove `kv_caches` and update scale if it's passed (hmellor)
255ed6c eager tests do work for now (ArthurZucker)
be6f244 Merge branch 'fix-history' of github.com:ArthurZucker/vllm into fix-h… (ArthurZucker)
4a855ea Respond to comments (hmellor)
e416227 fix failing test on phi: not all remote code have AutoModel (ArthurZucker)
8e92304 Merge branch 'fix-history' of github.com:ArthurZucker/vllm into fix-h… (ArthurZucker)
7758ea2 Merge branch 'main' of https://github.com/vllm-project/vllm into fix-… (ArthurZucker)
b74886e remove enforce eager for CI test (ArthurZucker)
5dabda8 remove BNB and LORA (ArthurZucker)
a1bd892 remove quantized test (ArthurZucker)
15327e3 update buildkite to run transformers test (ArthurZucker)
3fad390 Update vllm/model_executor/model_loader/utils.py (ArthurZucker)
5663a0c fix pre-commit (ArthurZucker)
17c6e02 Merge branch 'fix-history' of github.com:ArthurZucker/vllm into fix-h… (ArthurZucker)
5679d4d update (ArthurZucker)
03f1844 Fix failing registry test (hmellor)
d001748 temp: run transformers tests first (hmellor)
4741ab2 Update transformers pin in `requirements-test.txt` (hmellor)
9a29e46 update deps (ArthurZucker)
073ac5e Merge branch 'fix-history' of github.com:ArthurZucker/vllm into fix-h… (ArthurZucker)
5f6668f make v1 work (Isotr0py)
90be3b9 Merge branch 'main' into fix-history (Isotr0py)
95c1916 fix custom model test (Isotr0py)
2906626 fix incorrect backend fallback (Isotr0py)
8c33bd6 fix oot registration test (Isotr0py)
ccbff79 add transformers tp test (Isotr0py)
3647766 Update vllm/model_executor/model_loader/utils.py (Isotr0py)
f68af01 clean up (Isotr0py)
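
The user-facing switch introduced by this PR is the `model_impl` argument (see commit aff205a, "Make model implementation configurable"); the tests below pass it as "transformers" or "auto". A minimal usage sketch, assuming `vllm.LLM` accepts the same `model_impl` values that the test runner forwards:

from vllm import LLM, SamplingParams

# Sketch, not part of this PR's diff: force the Transformers backend
# instead of a native vLLM implementation. With model_impl="auto" the
# engine would prefer a native implementation and fall back to
# Transformers only when none exists.
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
          model_impl="transformers")

# Greedy decoding, mirroring check_implementation in the test file below.
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)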
File: tests/models/test_transformers.py (new file, +107 lines)

"""Test the functionality of the Transformers backend.

Run `pytest tests/models/test_transformers.py`.
"""
from contextlib import nullcontext
from typing import Type

import pytest

from vllm.model_executor.models import ModelRegistry

from ..conftest import HfRunner, VllmRunner
from ..utils import multi_gpu_marks
from .utils import check_logprobs_close

# Delete Llama from registry so we can pretend vLLM doesn't support it
del ModelRegistry.models["LlamaForCausalLM"]


# Code used to generate the ilama model:
# from transformers import AutoConfig, AutoModel, LlamaConfig, LlamaModel
#
# class IlamaConfig(LlamaConfig):
#     model_type = "ilama"
#
# class IlamaModel(LlamaModel):
#     config_class = IlamaConfig
#
# AutoConfig.register("ilama", IlamaConfig)
# AutoModel.register(IlamaConfig, IlamaModel)
#
# base_model = LlamaModel.from_pretrained("meta-llama/Llama-3.2-1B",
#                                         torch_dtype="auto")
# remote_model = IlamaModel._from_config(base_model.config)
# remote_model.load_state_dict(base_model.state_dict())
# remote_model.push_to_hub("ArthurZ/Ilama-3.2-1B")


def check_implementation(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    example_prompts: list[str],
    model: str,
    **kwargs,
):
    max_tokens = 32
    num_logprobs = 5

    with vllm_runner(model, **kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with hf_runner(model, **kwargs) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize(
    "model,model_impl,trust_remote_code",
    [("openai-community/gpt2", "transformers", None),
     ("meta-llama/Llama-3.2-1B-Instruct", "auto", None),
     ("ArthurZ/Ilama-3.2-1B", "auto", True)])
def test_models(hf_runner,
                vllm_runner,
                example_prompts,
                model,
                model_impl,
                trust_remote_code) -> None:

    # Forcing the Transformers backend for GPT-2 is expected to fail at
    # the time of this PR, so the test asserts the error instead.
    maybe_raises = nullcontext()
    if model == "openai-community/gpt2" and model_impl == "transformers":
        maybe_raises = pytest.raises(
            ValueError,
            match="The Transformers implementation.*not compatible with vLLM")

    with maybe_raises:
        check_implementation(hf_runner,
                             vllm_runner,
                             example_prompts,
                             model,
                             model_impl=model_impl,
                             trust_remote_code=trust_remote_code)


@multi_gpu_marks(num_gpus=2)
def test_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
):
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
    check_implementation(hf_runner, vllm_runner, example_prompts,
                         "meta-llama/Llama-3.2-1B-Instruct", **kwargs)


def test_quantized(
    hf_runner,
    vllm_runner,
    example_prompts,
):
    kwargs = {"model_impl": "transformers"}
    check_implementation(hf_runner, vllm_runner, example_prompts,
                         "unsloth/Llama-3.2-1B-Instruct-bnb-4bit", **kwargs)
Review comment: It would be worth documenting which features users should not expect to be supported, such as quantization.