Add Production Metrics in Prometheus format #1890

Merged · 8 commits · Dec 3, 2023
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -67,6 +67,7 @@ Documentation
serving/deploying_with_triton
serving/deploying_with_docker
serving/serving_with_langchain
serving/metrics

.. toctree::
:maxdepth: 1
13 changes: 13 additions & 0 deletions docs/source/serving/metrics.rst
@@ -0,0 +1,13 @@
Production Metrics
==================

vLLM exposes a number of metrics that can be used to monitor the health of the
system. These metrics are served on the ``/metrics`` endpoint of the vLLM
OpenAI-compatible API server.

The following metrics are exposed:

.. literalinclude:: ../../../vllm/engine/metrics.py
:language: python
:start-after: begin-metrics-definitions
:end-before: end-metrics-definitions
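
As a quick sanity check, the endpoint can be scraped by hand. The snippet below is only a minimal sketch; it assumes the OpenAI-compatible server is already running locally on its default port 8000, so adjust the URL to your deployment.

import requests  # any HTTP client works; Prometheus itself just issues a GET

# Fetch the Prometheus text exposition from the running API server.
resp = requests.get("http://localhost:8000/metrics", timeout=5)
resp.raise_for_status()

# Print only the vllm: gauge samples defined in vllm/engine/metrics.py.
for line in resp.text.splitlines():
    if line.startswith("vllm:"):
        print(line)
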
1 change: 1 addition & 0 deletions requirements.txt
@@ -12,3 +12,4 @@ xformers >= 0.0.22.post7 # Required for CUDA 12.1.
fastapi
uvicorn[standard]
pydantic == 1.10.13 # Required for OpenAI server.
aioprometheus[starlette]
15 changes: 13 additions & 2 deletions vllm/engine/llm_engine.py
@@ -7,6 +7,7 @@
SchedulerConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics import record_metrics
from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
@@ -591,8 +592,8 @@ def _log_system_stats(
else:
self.num_generation_tokens.append((now, num_batched_tokens))

elapsed_time = now - self.last_logging_time
if elapsed_time < _LOGGING_INTERVAL_SEC:
should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
if not should_log:
return

# Discard the old stats.
@@ -631,6 +632,16 @@ def _log_system_stats(
else:
cpu_cache_usage = 0.0

record_metrics(
avg_prompt_throughput=avg_prompt_throughput,
avg_generation_throughput=avg_generation_throughput,
scheduler_running=len(self.scheduler.running),
scheduler_swapped=len(self.scheduler.swapped),
scheduler_waiting=len(self.scheduler.waiting),
gpu_cache_usage=gpu_cache_usage,
cpu_cache_usage=cpu_cache_usage,
)

logger.info("Avg prompt throughput: "
f"{avg_prompt_throughput:.1f} tokens/s, "
"Avg generation throughput: "
51 changes: 51 additions & 0 deletions vllm/engine/metrics.py
@@ -0,0 +1,51 @@
from aioprometheus import Gauge

# The begin-* and end-* comments here are used by the documentation generator
# to extract the metrics definitions.

# begin-metrics-definitions
gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
"Average prefill throughput in tokens/s.")
gauge_avg_generation_throughput = Gauge(
"vllm:avg_generation_throughput_toks_per_s",
"Average generation throughput in tokens/s.")

gauge_scheduler_running = Gauge(
"vllm:num_requests_running",
"Number of requests that is currently running for inference.")
gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
"Number requests swapped to CPU.")
gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
"Number of requests waiting to be processed.")

gauge_gpu_cache_usage = Gauge(
"vllm:gpu_cache_usage_perc",
"GPU KV-cache usage. 1 means 100 percent usage.")
gauge_cpu_cache_usage = Gauge(
"vllm:cpu_cache_usage_perc",
"CPU KV-cache usage. 1 means 100 percent usage.")
# end-metrics-definitions

labels = {}


def add_global_metrics_labels(**kwargs):
labels.update(kwargs)


def record_metrics(
avg_prompt_throughput: float,
avg_generation_throughput: float,
scheduler_running: int,
scheduler_swapped: int,
scheduler_waiting: int,
gpu_cache_usage: float,
cpu_cache_usage: float,
):
gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
gauge_scheduler_running.set(labels, scheduler_running)
gauge_scheduler_swapped.set(labels, scheduler_swapped)
gauge_scheduler_waiting.set(labels, scheduler_waiting)
gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
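
For reference, a minimal sketch of how the helpers above fit together; the throughput numbers and the model name are placeholders, not taken from this PR. add_global_metrics_labels is called once at startup, record_metrics is then called periodically by the engine's stats logging, and every gauge sample carries the registered labels.

from vllm.engine.metrics import add_global_metrics_labels, record_metrics

# Register global labels once at startup (the API server uses the served model name).
add_global_metrics_labels(model_name="example-model")  # placeholder value

# Called periodically from LLMEngine._log_system_stats with the real numbers.
record_metrics(
    avg_prompt_throughput=120.0,
    avg_generation_throughput=45.0,
    scheduler_running=2,
    scheduler_swapped=0,
    scheduler_waiting=1,
    gpu_cache_usage=0.35,
    cpu_cache_usage=0.0,
)
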
10 changes: 10 additions & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -9,6 +9,8 @@
from http import HTTPStatus
from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union

from aioprometheus import MetricsMiddleware
from aioprometheus.asgi.starlette import metrics
import fastapi
import uvicorn
from fastapi import Request
@@ -18,6 +20,7 @@

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import add_global_metrics_labels
from vllm.entrypoints.openai.protocol import (
CompletionRequest, CompletionResponse, CompletionResponseChoice,
CompletionResponseStreamChoice, CompletionStreamResponse,
@@ -82,6 +85,10 @@ def parse_args():
return parser.parse_args()


app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics
app.add_route("/metrics", metrics)  # Exposes the Prometheus metrics endpoint


def create_error_response(status_code: HTTPStatus,
message: str) -> JSONResponse:
return JSONResponse(ErrorResponse(message=message,
@@ -722,6 +729,9 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
trust_remote_code=engine_model_config.trust_remote_code)
load_chat_template(args, tokenizer)

# Register labels for metrics
add_global_metrics_labels(model_name=engine_args.model)

uvicorn.run(app,
host=args.host,
port=args.port,
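
The aioprometheus wiring used above can also be reproduced in a standalone app when experimenting with the middleware and the /metrics route outside vLLM. This is an illustrative sketch only, reusing the imports from this PR; the port is arbitrary.

from aioprometheus import MetricsMiddleware
from aioprometheus.asgi.starlette import metrics
import fastapi
import uvicorn

app = fastapi.FastAPI()
app.add_middleware(MetricsMiddleware)  # trace HTTP server metrics, as above
app.add_route("/metrics", metrics)     # expose the Prometheus text endpoint

if __name__ == "__main__":
    # GET http://localhost:9000/metrics returns the Prometheus exposition format.
    uvicorn.run(app, host="0.0.0.0", port=9000)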