[Bugfix] Fix inappropriate content of model_name tag in Prometheus metrics #3937

Merged: 18 commits, May 4, 2024

Commits:
9ad067d  [Bugfix] Change the "model_name" tag in exposed Prometheus metrics to…  (DearPlanet, Apr 9, 2024)
c6adebc  Fix parameter parsing in EngineArgs  (DearPlanet, Apr 9, 2024)
2b06a8b  Merge remote-tracking branch 'upstream/main' into fix_metrics_tags  (DearPlanet, Apr 25, 2024)
a0e7407  refactor: reformat codes  (DearPlanet, Apr 25, 2024)
5756a04  Revert "Fix parameter parsing in EngineArgs"  (DearPlanet, Apr 26, 2024)
5f7fbbc  Move `served_model_name` to EngineArgs; adapt for name list input; ad…  (DearPlanet, Apr 26, 2024)
119b0b2  Add testcase for `model_name` tag in metrics  (DearPlanet, Apr 26, 2024)
c762bf2  Merge remote-tracking branch 'upstream/main' into fix_metrics_tags  (DearPlanet, Apr 26, 2024)
9109130  Merge remote-tracking branch 'upstream/main' into fix_metrics_tags  (DearPlanet, Apr 28, 2024)
55f405f  fix(params): `served_model_name` allow str input in `EngineArgs`  (DearPlanet, Apr 28, 2024)
9d2fc25  Merge remote-tracking branch 'upstream/main' into fix_metrics_tags  (DearPlanet, Apr 29, 2024)
827b2be  Merge remote-tracking branch 'upstream/main' into fix_metrics_tags  (DearPlanet, Apr 30, 2024)
b25a645  Merge branch 'main' into fix_metrics_tags  (DearPlanet, May 4, 2024)
27a5e8b  Update descriptions of `served_model_name` in vllm/config.py  (DearPlanet, May 4, 2024)
0ab2e61  simplify condition judgements  (DearPlanet, May 4, 2024)
7ea83a8  fix: `served_model_name` should be moved to `EngineArgs`  (DearPlanet, May 4, 2024)
6410214  fix: fix test cases  (DearPlanet, May 4, 2024)
a2a8fa0  Merge branch 'main' into fix_metrics_tags  (May 4, 2024)

Changes from 12 commits
30 changes: 30 additions & 0 deletions tests/metrics/test_metrics.py
@@ -1,3 +1,5 @@
from typing import List

import pytest

MODELS = [
@@ -68,3 +70,31 @@ def test_metric_counter_generation_tokens(
assert vllm_generation_count == metric_count, (
f"generation token count: {vllm_generation_count!r}\n"
f"metric: {metric_count!r}")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"served_model_name",
[None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
served_model_name: List[str]) -> None:
vllm_model = vllm_runner(model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.3,
served_model_name=served_model_name)
stat_logger = vllm_model.model.llm_engine.stat_logger
metrics_tag_content = stat_logger.labels["model_name"]

del vllm_model

if served_model_name is None or served_model_name == []:
assert metrics_tag_content == model, (
f"Metrics tag model_name is wrong! expect: {model!r}\n"
f"actual: {metrics_tag_content!r}")
else:
assert metrics_tag_content == served_model_name[0], (
f"Metrics tag model_name is wrong! expect: "
f"{served_model_name[0]!r}\n"
f"actual: {metrics_tag_content!r}")
27 changes: 27 additions & 0 deletions vllm/config.py
@@ -36,6 +36,8 @@ class ModelConfig:

Args:
model: Name or path of the huggingface model to use.
It is also used as the content of the `model_name` tag in metrics
output when `served_model_name` is not specified.
tokenizer: Name or path of the huggingface tokenizer to use.
tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
available, and "slow" will always use the slow tokenizer.
@@ -71,6 +73,10 @@ class ModelConfig:
to eager mode.
skip_tokenizer_init: If true, skip initialization of tokenizer and
detokenizer.
served_model_name: The model name used in metrics tag `model_name`,
same as the model name exposed in the service. If multiple model names
are provided, the first name will be used. If not specified, the model
name will be the same as `model`.
"""

def __init__(
Expand All @@ -91,6 +97,7 @@ def __init__(
max_context_len_to_capture: Optional[int] = None,
max_logprobs: int = 5,
skip_tokenizer_init: bool = False,
served_model_name: Optional[Union[str, List[str]]] = None,
) -> None:
self.model = model
self.tokenizer = tokenizer
@@ -113,6 +120,8 @@ def __init__(
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
self.max_model_len = _get_and_verify_max_len(self.hf_text_config,
max_model_len)
self.served_model_name = get_served_model_name(model,
served_model_name)
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
self._verify_quantization()
@@ -1108,6 +1117,24 @@ def _get_and_verify_max_len(
return int(max_model_len)


def get_served_model_name(model: str,
served_model_name: Optional[Union[str, List[str]]]):
"""
If the input is a non-empty list, the first model_name in
`served_model_name` is taken.
If the input is a non-empty string, it is used directly.
For cases where the input is either an empty string or an
empty list, the fallback is to use `model`.
"""
if isinstance(served_model_name, list) and served_model_name:
model_name_res = served_model_name[0]
elif isinstance(served_model_name, str) and served_model_name:
model_name_res = served_model_name
else:
model_name_res = model
return model_name_res


@dataclass
class DecodingConfig:
"""Dataclass which contains the decoding strategy of the engine"""
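The new `get_served_model_name` helper is the single place where the metrics label is resolved. A minimal sketch of its behaviour, using a made-up model path purely for illustration:

from vllm.config import get_served_model_name

# Falls back to the model path when nothing is overridden.
assert get_served_model_name("facebook/opt-125m", None) == "facebook/opt-125m"
assert get_served_model_name("facebook/opt-125m", []) == "facebook/opt-125m"

# A plain string is used as-is; a list contributes its first entry.
assert get_served_model_name("facebook/opt-125m", "my-alias") == "my-alias"
assert get_served_model_name("facebook/opt-125m",
                             ["my-alias", "my-alias-2"]) == "my-alias"
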
21 changes: 19 additions & 2 deletions vllm/engine/arg_utils.py
@@ -1,7 +1,7 @@
import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional
from typing import List, Optional, Union

from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
@@ -15,6 +15,7 @@
class EngineArgs:
"""Arguments for vLLM engine."""
model: str
served_model_name: Optional[Union[str, List[str]]] = None
tokenizer: Optional[str] = None
skip_tokenizer_init: bool = False
tokenizer_mode: str = 'auto'
@@ -458,6 +459,21 @@ def add_cli_args(
'This should be a JSON string that will be '
'parsed into a dictionary.')

parser.add_argument(
"--served-model-name",
nargs="+",
type=str,
default=None,
help="The model name(s) used in the API. If multiple "
"names are provided, the server will respond to any "
"of the provided names. The model name in the model "
"field of a response will be the first name in this "
"list. If not specified, the model name will be the "
"same as the `--model` argument. Noted that this name(s)"
"will also be used in `model_name` tag content of "
"prometheus metrics, if multiple names provided, metrics"
"tag will take the first one.")

return parser

@classmethod
@@ -476,7 +492,8 @@ def create_engine_config(self, ) -> EngineConfig:
self.code_revision, self.tokenizer_revision, self.max_model_len,
self.quantization, self.quantization_param_path,
self.enforce_eager, self.max_context_len_to_capture,
self.max_logprobs, self.skip_tokenizer_init)
self.max_logprobs, self.skip_tokenizer_init,
self.served_model_name)
cache_config = CacheConfig(self.block_size,
self.gpu_memory_utilization,
self.swap_space, self.kv_cache_dtype,
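Because `served_model_name` now lives on `EngineArgs`, it reaches `ModelConfig` on every engine path, not only the OpenAI server. A rough sketch of the flow, assuming the engine API at the time of this PR (model path and aliases are made up):

from vllm.engine.arg_utils import EngineArgs

# served_model_name is forwarded into ModelConfig, which is what the
# StatLogger reads when it builds the Prometheus label set.
engine_args = EngineArgs(model="facebook/opt-125m",
                         served_model_name=["my-alias", "my-alias-2"])
engine_config = engine_args.create_engine_config()
assert engine_config.model_config.served_model_name == "my-alias"
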
5 changes: 3 additions & 2 deletions vllm/engine/llm_engine.py
@@ -105,7 +105,7 @@ def __init__(
"tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
"quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
"quantization_param_path=%s, device_config=%s, "
"decoding_config=%r, seed=%d)",
"decoding_config=%r, seed=%d, served_model_name=%s)",
vllm.__version__,
model_config.model,
speculative_config,
@@ -128,6 +128,7 @@ def __init__(
device_config.device,
decoding_config,
model_config.seed,
model_config.served_model_name,
)
# TODO(woosuk): Print more configs in debug mode.

@@ -218,7 +219,7 @@ def __init__(
if self.log_stats:
self.stat_logger = StatLogger(
local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
labels=dict(model_name=model_config.model),
labels=dict(model_name=model_config.served_model_name),
max_model_len=self.model_config.max_model_len)
self.stat_logger.info("cache_config", self.cache_config)

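For context on why this label dict matters: `StatLogger` forwards `labels` to the underlying Prometheus client, so whatever string sits under `model_name` is stamped onto every exported series. A standalone illustration with plain prometheus_client (not vLLM code, metric name made up):

from prometheus_client import Gauge, generate_latest

# Declare the label once, then attach a concrete value to every sample.
demo_gauge = Gauge("demo_num_requests_running",
                   "Demo gauge carrying a model_name label",
                   labelnames=["model_name"])
demo_gauge.labels(model_name="my-alias").set(3)

# Prints e.g.: demo_num_requests_running{model_name="my-alias"} 3.0
print(generate_latest().decode())
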
10 changes: 0 additions & 10 deletions vllm/entrypoints/openai/cli_args.py
@@ -53,16 +53,6 @@ def make_arg_parser():
default=None,
help="If provided, the server will require this key "
"to be presented in the header.")
parser.add_argument("--served-model-name",
nargs="+",
type=str,
default=None,
help="The model name(s) used in the API. If multiple "
"names are provided, the server will respond to any "
"of the provided names. The model name in the model "
"field of a response will be the first name in this "
"list. If not specified, the model name will be the "
"same as the `--model` argument.")
parser.add_argument(
"--lora-modules",
type=str,