[Bugfix] Fix inappropriate content of model_name tag in Prometheus metrics #3937

Merged: 18 commits, May 4, 2024

Commits:
9ad067d  [Bugfix] Change the "model_name" tag in exposed Prometheus metrics to…  (DearPlanet, Apr 9, 2024)
c6adebc  Fix parameter parsing in EngineArgs  (DearPlanet, Apr 9, 2024)
2b06a8b  Merge remote-tracking branch 'upstream/main' into fix_metrics_tags  (DearPlanet, Apr 25, 2024)
a0e7407  refactor: reformat codes  (DearPlanet, Apr 25, 2024)
5756a04  Revert "Fix parameter parsing in EngineArgs"  (DearPlanet, Apr 26, 2024)
5f7fbbc  Move `served_model_name` to EngineArgs; adapt for name list input; ad…  (DearPlanet, Apr 26, 2024)
119b0b2  Add testcase for `model_name` tag in metrics  (DearPlanet, Apr 26, 2024)
c762bf2  Merge remote-tracking branch 'upstream/main' into fix_metrics_tags  (DearPlanet, Apr 26, 2024)
9109130  Merge remote-tracking branch 'upstream/main' into fix_metrics_tags  (DearPlanet, Apr 28, 2024)
55f405f  fix(params): `served_model_name` allow str input in `EngineArgs`  (DearPlanet, Apr 28, 2024)
9d2fc25  Merge remote-tracking branch 'upstream/main' into fix_metrics_tags  (DearPlanet, Apr 29, 2024)
827b2be  Merge remote-tracking branch 'upstream/main' into fix_metrics_tags  (DearPlanet, Apr 30, 2024)
b25a645  Merge branch 'main' into fix_metrics_tags  (DearPlanet, May 4, 2024)
27a5e8b  Update descriptions of `served_model_name` in vllm/config.py  (DearPlanet, May 4, 2024)
0ab2e61  simplify condition judgements  (DearPlanet, May 4, 2024)
7ea83a8  fix: `served_model_name` should be moved to `EngineArgs`  (DearPlanet, May 4, 2024)
6410214  fix: fix test cases  (DearPlanet, May 4, 2024)
a2a8fa0  Merge branch 'main' into fix_metrics_tags  (May 4, 2024)

Changes from 12 commits
30 changes: 30 additions & 0 deletions tests/metrics/test_metrics.py
@@ -1,3 +1,5 @@
from typing import List

import pytest

MODELS = [
@@ -68,3 +70,31 @@ def test_metric_counter_generation_tokens(
assert vllm_generation_count == metric_count, (
f"generation token count: {vllm_generation_count!r}\n"
f"metric: {metric_count!r}")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"served_model_name",
[None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
served_model_name: List[str]) -> None:
vllm_model = vllm_runner(model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.3,
served_model_name=served_model_name)
stat_logger = vllm_model.model.llm_engine.stat_logger
metrics_tag_content = stat_logger.labels["model_name"]

del vllm_model

if served_model_name is None or served_model_name == []:
assert metrics_tag_content == model, (
f"Metrics tag model_name is wrong! expect: {model!r}\n"
f"actual: {metrics_tag_content!r}")
else:
assert metrics_tag_content == served_model_name[0], (
f"Metrics tag model_name is wrong! expect: "
f"{served_model_name[0]!r}\n"
f"actual: {metrics_tag_content!r}")
27 changes: 27 additions & 0 deletions vllm/config.py
@@ -36,6 +36,8 @@ class ModelConfig:

Args:
model: Name or path of the huggingface model to use.
It is also used as the content of the `model_name` tag in metrics
output when `served_model_name` is not specified.
tokenizer: Name or path of the huggingface tokenizer to use.
tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
available, and "slow" will always use the slow tokenizer.
@@ -71,6 +73,10 @@ class ModelConfig:
to eager mode.
skip_tokenizer_init: If true, skip initialization of tokenizer and
detokenizer.
served_model_name: The model name used in metrics tag `model_name`,
same as the model name exposed in the service. If multiple model names
are provided, the first name will be used. If not specified, the model
name will be the same as `model`.
"""

def __init__(
Expand All @@ -91,6 +97,7 @@ def __init__(
max_context_len_to_capture: Optional[int] = None,
max_logprobs: int = 5,
skip_tokenizer_init: bool = False,
served_model_name: Optional[Union[str, List[str]]] = None,
) -> None:
self.model = model
self.tokenizer = tokenizer
@@ -113,6 +120,8 @@ def __init__(
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
self.max_model_len = _get_and_verify_max_len(self.hf_text_config,
max_model_len)
self.served_model_name = get_served_model_name(model,
served_model_name)
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
self._verify_quantization()
@@ -1108,6 +1117,24 @@ def _get_and_verify_max_len(
return int(max_model_len)


def get_served_model_name(model: str,
served_model_name: Optional[Union[str, List[str]]]):
"""
If the input is a non-empty list, the first model_name in
`served_model_name` is taken.
If the input is a non-empty string, it is used directly.
For cases where the input is either an empty string or an
empty list, the fallback is to use `model`.
"""
if isinstance(served_model_name, list) and served_model_name:
model_name_res = served_model_name[0]
elif isinstance(served_model_name, str) and served_model_name:
model_name_res = served_model_name
else:
model_name_res = model
return model_name_res


@dataclass
class DecodingConfig:
"""Dataclass which contains the decoding strategy of the engine"""
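The new `get_served_model_name` helper is the single place where the metrics label is resolved. A minimal sketch of its behaviour, using a made-up model path purely for illustration:

from vllm.config import get_served_model_name

# Falls back to the model path when nothing is overridden.
assert get_served_model_name("facebook/opt-125m", None) == "facebook/opt-125m"
assert get_served_model_name("facebook/opt-125m", []) == "facebook/opt-125m"

# A plain string is used as-is; a list contributes its first entry.
assert get_served_model_name("facebook/opt-125m", "my-alias") == "my-alias"
assert get_served_model_name("facebook/opt-125m",
                             ["my-alias", "my-alias-2"]) == "my-alias"
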
21 changes: 19 additions & 2 deletions vllm/engine/arg_utils.py
@@ -1,7 +1,7 @@
import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional
from typing import List, Optional, Union

from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
@@ -15,6 +15,7 @@
class EngineArgs:
"""Arguments for vLLM engine."""
model: str
served_model_name: Optional[Union[str, List[str]]] = None
tokenizer: Optional[str] = None
skip_tokenizer_init: bool = False
tokenizer_mode: str = 'auto'
@@ -458,6 +459,21 @@ def add_cli_args(
'This should be a JSON string that will be '
'parsed into a dictionary.')

parser.add_argument(
"--served-model-name",
nargs="+",
type=str,
default=None,
help="The model name(s) used in the API. If multiple "
"names are provided, the server will respond to any "
"of the provided names. The model name in the model "
"field of a response will be the first name in this "
"list. If not specified, the model name will be the "
"same as the `--model` argument. Noted that this name(s)"
"will also be used in `model_name` tag content of "
"prometheus metrics, if multiple names provided, metrics"
"tag will take the first one.")

return parser

@classmethod
@@ -476,7 +492,8 @@ def create_engine_config(self, ) -> EngineConfig:
self.code_revision, self.tokenizer_revision, self.max_model_len,
self.quantization, self.quantization_param_path,
self.enforce_eager, self.max_context_len_to_capture,
self.max_logprobs, self.skip_tokenizer_init)
self.max_logprobs, self.skip_tokenizer_init,
self.served_model_name)
cache_config = CacheConfig(self.block_size,
self.gpu_memory_utilization,
self.swap_space, self.kv_cache_dtype,
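Because `served_model_name` now lives on `EngineArgs`, it reaches `ModelConfig` on every engine path, not only the OpenAI server. A rough sketch of the flow, assuming the engine API at the time of this PR (model path and aliases are made up):

from vllm.engine.arg_utils import EngineArgs

# served_model_name is forwarded into ModelConfig, which is what the
# StatLogger reads when it builds the Prometheus label set.
engine_args = EngineArgs(model="facebook/opt-125m",
                         served_model_name=["my-alias", "my-alias-2"])
engine_config = engine_args.create_engine_config()
assert engine_config.model_config.served_model_name == "my-alias"
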
5 changes: 3 additions & 2 deletions vllm/engine/llm_engine.py
@@ -105,7 +105,7 @@ def __init__(
"tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
"quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
"quantization_param_path=%s, device_config=%s, "
"decoding_config=%r, seed=%d)",
"decoding_config=%r, seed=%d, served_model_name=%s)",
vllm.__version__,
model_config.model,
speculative_config,
@@ -128,6 +128,7 @@ def __init__(
device_config.device,
decoding_config,
model_config.seed,
model_config.served_model_name,
)
# TODO(woosuk): Print more configs in debug mode.

@@ -218,7 +219,7 @@ def __init__(
if self.log_stats:
self.stat_logger = StatLogger(
local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
labels=dict(model_name=model_config.model),
labels=dict(model_name=model_config.served_model_name),
max_model_len=self.model_config.max_model_len)
self.stat_logger.info("cache_config", self.cache_config)

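For context on why this label dict matters: `StatLogger` forwards `labels` to the underlying Prometheus client, so whatever string sits under `model_name` is stamped onto every exported series. A standalone illustration with plain prometheus_client (not vLLM code, metric name made up):

from prometheus_client import Gauge, generate_latest

# Declare the label once, then attach a concrete value to every sample.
demo_gauge = Gauge("demo_num_requests_running",
                   "Demo gauge carrying a model_name label",
                   labelnames=["model_name"])
demo_gauge.labels(model_name="my-alias").set(3)

# Prints e.g.: demo_num_requests_running{model_name="my-alias"} 3.0
print(generate_latest().decode())
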
10 changes: 0 additions & 10 deletions vllm/entrypoints/openai/cli_args.py
@@ -53,16 +53,6 @@ def make_arg_parser():
default=None,
help="If provided, the server will require this key "
"to be presented in the header.")
parser.add_argument("--served-model-name",
nargs="+",
type=str,
default=None,
help="The model name(s) used in the API. If multiple "
"names are provided, the server will respond to any "
"of the provided names. The model name in the model "
"field of a response will be the first name in this "
"list. If not specified, the model name will be the "
"same as the `--model` argument.")
parser.add_argument(
"--lora-modules",
type=str,