From 20f77fb3343562adc18e629bba1b714b809ec868 Mon Sep 17 00:00:00 2001 From: Yessen Kanapin Date: Fri, 20 Oct 2023 20:39:57 +0000 Subject: [PATCH 1/7] Add support for prometheus metrics Add /metrics for openai endpoint with the metrics that were already logged. --- Dockerfile | 2 +- vllm/engine/llm_engine.py | 53 +++++++++++++++++++++------ vllm/entrypoints/openai/api_server.py | 11 ++++++ 3 files changed, 53 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6773689f75b2a..7575cbbadfb4a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -63,7 +63,7 @@ ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"] FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate fschat + pip install accelerate fschat aioprometheus COPY --from=build /workspace/vllm/*.so /workspace/vllm/ COPY vllm vllm diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c3752b11f5660..dc910e1cc5984 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -18,6 +18,12 @@ get_tokenizer) from vllm.utils import Counter +try: + from aioprometheus import Gauge + _prometheus_available = True +except ImportError: + _prometheus_available = False + if ray: from ray.air.util.torch_dist import init_torch_dist_process_group from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -28,6 +34,19 @@ logger = init_logger(__name__) _LOGGING_INTERVAL_SEC = 5 +if _prometheus_available: + gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput", + "Avg prefill throughput") + gauge_avg_generation_throughput = Gauge("vllm:avg_generation_throughput", + "Avg prefill throughput") + gauge_scheduler_running = Gauge("vllm:scheduler_running", + "Num requests running") + gauge_scheduler_swapped = Gauge("vllm:scheduler_swapped", + "Num requests swapped") + gauge_scheduler_waiting = Gauge("vllm:scheduler_waiting", + "Num requests waiting") + gauge_gpu_cache_usage = Gauge("vllm:gpu_cache_usage", "GPU KV-cache usage") + gauge_cpu_cache_usage = Gauge("vllm:cpu_cache_usage", "CPU KV-cache usage") class LLMEngine: @@ -581,8 +600,8 @@ def _log_system_stats( else: self.num_generation_tokens.append((now, num_batched_tokens)) - elapsed_time = now - self.last_logging_time - if elapsed_time < _LOGGING_INTERVAL_SEC: + should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC + if not (should_log or _prometheus_available): return # Discard the old stats. 
@@ -621,16 +640,26 @@ def _log_system_stats( else: cpu_cache_usage = 0.0 - logger.info("Avg prompt throughput: " - f"{avg_prompt_throughput:.1f} tokens/s, " - "Avg generation throughput: " - f"{avg_generation_throughput:.1f} tokens/s, " - f"Running: {len(self.scheduler.running)} reqs, " - f"Swapped: {len(self.scheduler.swapped)} reqs, " - f"Pending: {len(self.scheduler.waiting)} reqs, " - f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, " - f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%") - self.last_logging_time = now + if _prometheus_available: + gauge_avg_prompt_throughput.set({}, avg_prompt_throughput) + gauge_avg_generation_throughput.set({}, avg_generation_throughput) + gauge_scheduler_running.set({}, len(self.scheduler.running)) + gauge_scheduler_swapped.set({}, len(self.scheduler.swapped)) + gauge_scheduler_waiting.set({}, len(self.scheduler.waiting)) + gauge_gpu_cache_usage.set({}, gpu_cache_usage) + gauge_cpu_cache_usage.set({}, cpu_cache_usage) + + if should_log: + logger.info("Avg prompt throughput: " + f"{avg_prompt_throughput:.1f} tokens/s, " + "Avg generation throughput: " + f"{avg_generation_throughput:.1f} tokens/s, " + f"Running: {len(self.scheduler.running)} reqs, " + f"Swapped: {len(self.scheduler.swapped)} reqs, " + f"Pending: {len(self.scheduler.waiting)} reqs, " + f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, " + f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%,") + self.last_logging_time = now def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: """Decodes the new token for a sequence.""" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f336b46565553..76046e1c628e3 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -39,6 +39,13 @@ except ImportError: _fastchat_available = False +try: + from aioprometheus import MetricsMiddleware + from aioprometheus.asgi.starlette import metrics + _prometheus_available = True +except ImportError: + _prometheus_available = False + TIMEOUT_KEEP_ALIVE = 5 # seconds logger = init_logger(__name__) @@ -46,6 +53,10 @@ app = fastapi.FastAPI() engine = None +if _prometheus_available: + app.add_middleware(MetricsMiddleware) + app.add_route("/metrics", metrics) + def create_error_response(status_code: HTTPStatus, message: str) -> JSONResponse: From fcb95ba7d1df0a918aee4ad5080b0a39a0dd7343 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Thu, 30 Nov 2023 21:24:52 +0000 Subject: [PATCH 2/7] use aioprometheus as default, refactor code for modularity --- docs/source/index.rst | 1 + docs/source/serving/metrics.rst | 13 ++++++ requirements.txt | 1 + vllm/engine/llm_engine.py | 62 ++++++++++----------------- vllm/engine/metrics.py | 47 ++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 19 ++++---- 6 files changed, 94 insertions(+), 49 deletions(-) create mode 100644 docs/source/serving/metrics.rst create mode 100644 vllm/engine/metrics.py diff --git a/docs/source/index.rst b/docs/source/index.rst index eb98aa6049bfb..c3a1d8f2498a2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,6 +66,7 @@ Documentation serving/run_on_sky serving/deploying_with_triton serving/deploying_with_docker + serving/metrics .. 
toctree:: :maxdepth: 1 diff --git a/docs/source/serving/metrics.rst b/docs/source/serving/metrics.rst new file mode 100644 index 0000000000000..15e57bd3fec65 --- /dev/null +++ b/docs/source/serving/metrics.rst @@ -0,0 +1,13 @@ +Production Metrics +================== + +vLLM exposes a number of metrics that can be used to monitor the health of the +system. These metrics are exposed via the `/metrics` endpoint on the vLLM +OpenAI compatible API server. + +The following metrics are exposed: + +.. literalinclude:: ../../../vllm/engine/metrics.py + :language: python + :start-after: begin-metrics-definitions + :end-before: end-metrics-definitions diff --git a/requirements.txt b/requirements.txt index fa9eb6386ae71..a593324c3d878 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ xformers >= 0.0.22.post7 # Required for CUDA 12.1. fastapi uvicorn[standard] pydantic == 1.10.13 # Required for OpenAI server. +aioprometheus[starlette] diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index dc910e1cc5984..6bf3622e63e5b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -8,6 +8,7 @@ from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs from vllm.engine.ray_utils import RayWorker, initialize_cluster, ray +from vllm.engine.metrics import record_metrics from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams @@ -18,12 +19,6 @@ get_tokenizer) from vllm.utils import Counter -try: - from aioprometheus import Gauge - _prometheus_available = True -except ImportError: - _prometheus_available = False - if ray: from ray.air.util.torch_dist import init_torch_dist_process_group from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -34,19 +29,6 @@ logger = init_logger(__name__) _LOGGING_INTERVAL_SEC = 5 -if _prometheus_available: - gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput", - "Avg prefill throughput") - gauge_avg_generation_throughput = Gauge("vllm:avg_generation_throughput", - "Avg prefill throughput") - gauge_scheduler_running = Gauge("vllm:scheduler_running", - "Num requests running") - gauge_scheduler_swapped = Gauge("vllm:scheduler_swapped", - "Num requests swapped") - gauge_scheduler_waiting = Gauge("vllm:scheduler_waiting", - "Num requests waiting") - gauge_gpu_cache_usage = Gauge("vllm:gpu_cache_usage", "GPU KV-cache usage") - gauge_cpu_cache_usage = Gauge("vllm:cpu_cache_usage", "CPU KV-cache usage") class LLMEngine: @@ -601,7 +583,7 @@ def _log_system_stats( self.num_generation_tokens.append((now, num_batched_tokens)) should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC - if not (should_log or _prometheus_available): + if not should_log: return # Discard the old stats. 
@@ -640,26 +622,26 @@ def _log_system_stats( else: cpu_cache_usage = 0.0 - if _prometheus_available: - gauge_avg_prompt_throughput.set({}, avg_prompt_throughput) - gauge_avg_generation_throughput.set({}, avg_generation_throughput) - gauge_scheduler_running.set({}, len(self.scheduler.running)) - gauge_scheduler_swapped.set({}, len(self.scheduler.swapped)) - gauge_scheduler_waiting.set({}, len(self.scheduler.waiting)) - gauge_gpu_cache_usage.set({}, gpu_cache_usage) - gauge_cpu_cache_usage.set({}, cpu_cache_usage) - - if should_log: - logger.info("Avg prompt throughput: " - f"{avg_prompt_throughput:.1f} tokens/s, " - "Avg generation throughput: " - f"{avg_generation_throughput:.1f} tokens/s, " - f"Running: {len(self.scheduler.running)} reqs, " - f"Swapped: {len(self.scheduler.swapped)} reqs, " - f"Pending: {len(self.scheduler.waiting)} reqs, " - f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, " - f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%,") - self.last_logging_time = now + record_metrics( + avg_prompt_throughput=avg_prompt_throughput, + avg_generation_throughput=avg_generation_throughput, + scheduler_running=len(self.scheduler.running), + scheduler_swapped=len(self.scheduler.swapped), + scheduler_waiting=len(self.scheduler.waiting), + gpu_cache_usage=gpu_cache_usage, + cpu_cache_usage=cpu_cache_usage, + ) + + logger.info("Avg prompt throughput: " + f"{avg_prompt_throughput:.1f} tokens/s, " + "Avg generation throughput: " + f"{avg_generation_throughput:.1f} tokens/s, " + f"Running: {len(self.scheduler.running)} reqs, " + f"Swapped: {len(self.scheduler.swapped)} reqs, " + f"Pending: {len(self.scheduler.waiting)} reqs, " + f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, " + f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%,") + self.last_logging_time = now def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: """Decodes the new token for a sequence.""" diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py new file mode 100644 index 0000000000000..1850c0e1d8000 --- /dev/null +++ b/vllm/engine/metrics.py @@ -0,0 +1,47 @@ +from aioprometheus import Gauge + +# The begin-* and end* here are used by the documentation generator +# to extract the metrics definitions. + +# begin-metrics-definitions +gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s", + "Average prefill throughput in tokens/s.") +gauge_avg_generation_throughput = Gauge( + "vllm:avg_generation_throughput_toks_per_s", + "Average generation throughput in tokens/s.") +gauge_scheduler_running = Gauge( + "vllm:num_requests_running", + "Number of requests that is currently running for inference.") +gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped", + "Number requests swapped to CPU.") +gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting", + "Number of requests waiting to be processed.") +gauge_gpu_cache_usage = Gauge( + "vllm:gpu_cache_usage_perc", + "GPU KV-cache usage. 1 means 100 percent usage.") +gauge_cpu_cache_usage = Gauge( + "vllm:cpu_cache_usage_perc", + "CPU KV-cache usage. 
1 means 100 percent usage.") +# end-metrics-definitions + +labels = {} + +def add_global_metrics_labels(**kwargs): + labels.update(kwargs) + +def record_metrics( + avg_prompt_throughput, + avg_generation_throughput, + scheduler_running, + scheduler_swapped, + scheduler_waiting, + gpu_cache_usage, + cpu_cache_usage, +): + gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput) + gauge_avg_generation_throughput.set(labels, avg_generation_throughput) + gauge_scheduler_running.set(labels, scheduler_running) + gauge_scheduler_swapped.set(labels, scheduler_swapped) + gauge_scheduler_waiting.set(labels, scheduler_waiting) + gauge_gpu_cache_usage.set(labels, gpu_cache_usage) + gauge_cpu_cache_usage.set(labels, cpu_cache_usage) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 76046e1c628e3..cd8dcaccf198e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -18,6 +18,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.engine.metrics import add_global_metrics_labels from vllm.entrypoints.openai.protocol import ( CompletionRequest, CompletionResponse, CompletionResponseChoice, CompletionResponseStreamChoice, CompletionStreamResponse, @@ -39,12 +40,8 @@ except ImportError: _fastchat_available = False -try: - from aioprometheus import MetricsMiddleware - from aioprometheus.asgi.starlette import metrics - _prometheus_available = True -except ImportError: - _prometheus_available = False +from aioprometheus import MetricsMiddleware +from aioprometheus.asgi.starlette import metrics TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -53,9 +50,8 @@ app = fastapi.FastAPI() engine = None -if _prometheus_available: - app.add_middleware(MetricsMiddleware) - app.add_route("/metrics", metrics) +app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics +app.add_route("/metrics", metrics) # Exposes HTTP metrics def create_error_response(status_code: HTTPStatus, @@ -640,6 +636,11 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]: tokenizer_mode=engine_args.tokenizer_mode, trust_remote_code=engine_args.trust_remote_code) + # Register labels for metrics + add_global_metrics_labels( + model_name=engine_args.model, + ) + uvicorn.run(app, host=args.host, port=args.port, From 2c243da52c7e5e20c1a958a383e73c1c94ede1db Mon Sep 17 00:00:00 2001 From: simon-mo Date: Thu, 30 Nov 2023 21:39:47 +0000 Subject: [PATCH 3/7] format code --- vllm/engine/metrics.py | 4 ++++ vllm/entrypoints/openai/api_server.py | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 1850c0e1d8000..66050a109235d 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -9,6 +9,7 @@ gauge_avg_generation_throughput = Gauge( "vllm:avg_generation_throughput_toks_per_s", "Average generation throughput in tokens/s.") + gauge_scheduler_running = Gauge( "vllm:num_requests_running", "Number of requests that is currently running for inference.") @@ -16,6 +17,7 @@ "Number requests swapped to CPU.") gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting", "Number of requests waiting to be processed.") + gauge_gpu_cache_usage = Gauge( "vllm:gpu_cache_usage_perc", "GPU KV-cache usage. 
1 means 100 percent usage.") @@ -26,9 +28,11 @@ labels = {} + def add_global_metrics_labels(**kwargs): labels.update(kwargs) + def record_metrics( avg_prompt_throughput, avg_generation_throughput, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index cd8dcaccf198e..a0b74b974df48 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -637,9 +637,7 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]: trust_remote_code=engine_args.trust_remote_code) # Register labels for metrics - add_global_metrics_labels( - model_name=engine_args.model, - ) + add_global_metrics_labels(model_name=engine_args.model, ) uvicorn.run(app, host=args.host, From 37c902d6f4ed5ef1af9b89b216de7a6ca0314908 Mon Sep 17 00:00:00 2001 From: simon-mo Date: Sat, 2 Dec 2023 06:03:45 +0000 Subject: [PATCH 4/7] lint --- vllm/entrypoints/openai/api_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6ab74ebf29c79..efa4399b7a430 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -85,6 +85,7 @@ def parse_args(): parser = AsyncEngineArgs.add_cli_args(parser) return parser.parse_args() + app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics app.add_route("/metrics", metrics) # Exposes HTTP metrics From c3296bd84b7ba42cf16b7c54c9170450393f385c Mon Sep 17 00:00:00 2001 From: simon-mo Date: Sat, 2 Dec 2023 06:10:13 +0000 Subject: [PATCH 5/7] remove aioprometheus in dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index b5d7680e6cf8a..7fd8933957334 100644 --- a/Dockerfile +++ b/Dockerfile @@ -68,7 +68,7 @@ ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"] FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate fschat aioprometheus + pip install accelerate fschat COPY --from=build /workspace/vllm/*.so /workspace/vllm/ COPY vllm vllm From e48c54146d566e46f6504d606eae8907800c4b6e Mon Sep 17 00:00:00 2001 From: simon-mo Date: Sat, 2 Dec 2023 15:24:07 -0800 Subject: [PATCH 6/7] comments --- vllm/engine/llm_engine.py | 4 ++-- vllm/engine/metrics.py | 14 +++++++------- vllm/entrypoints/openai/api_server.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d5fdba0d835c5..2400dd53d923c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -7,8 +7,8 @@ SchedulerConfig) from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs -from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray from vllm.engine.metrics import record_metrics +from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams @@ -650,7 +650,7 @@ def _log_system_stats( f"Swapped: {len(self.scheduler.swapped)} reqs, " f"Pending: {len(self.scheduler.waiting)} reqs, " f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, " - f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%,") + f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%") self.last_logging_time = now def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: diff --git a/vllm/engine/metrics.py 
b/vllm/engine/metrics.py index 66050a109235d..c64071207f6a0 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -34,13 +34,13 @@ def add_global_metrics_labels(**kwargs): def record_metrics( - avg_prompt_throughput, - avg_generation_throughput, - scheduler_running, - scheduler_swapped, - scheduler_waiting, - gpu_cache_usage, - cpu_cache_usage, + avg_prompt_throughput: float, + avg_generation_throughput: float, + scheduler_running: int, + scheduler_swapped: int, + scheduler_waiting: int, + gpu_cache_usage: float, + cpu_cache_usage: float, ): gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput) gauge_avg_generation_throughput.set(labels, avg_generation_throughput) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index efa4399b7a430..1778074d160a4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -731,7 +731,7 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]: load_chat_template(args, tokenizer) # Register labels for metrics - add_global_metrics_labels(model_name=engine_args.model, ) + add_global_metrics_labels(model_name=engine_args.model) uvicorn.run(app, host=args.host, From 604130f88dc85102133e89db94b7345605b01a1b Mon Sep 17 00:00:00 2001 From: simon-mo Date: Sat, 2 Dec 2023 16:07:28 -0800 Subject: [PATCH 7/7] import --- vllm/entrypoints/openai/api_server.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1778074d160a4..39ea750aa9dc1 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -9,6 +9,8 @@ from http import HTTPStatus from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union +from aioprometheus import MetricsMiddleware +from aioprometheus.asgi.starlette import metrics import fastapi import uvicorn from fastapi import Request @@ -32,9 +34,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import random_uuid -from aioprometheus import MetricsMiddleware -from aioprometheus.asgi.starlette import metrics - TIMEOUT_KEEP_ALIVE = 5 # seconds logger = init_logger(__name__)
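
As a quick way to exercise the result of this series, the sketch below polls the new /metrics route and prints only the vLLM gauges defined in vllm/engine/metrics.py. It is illustrative only and not part of the patches: it assumes the OpenAI-compatible server is already running on the default localhost:8000 and that the `requests` package is installed.

# Illustrative sketch (not part of the series): poll the /metrics endpoint
# added above and print the vllm:* gauges. Assumes the OpenAI-compatible
# server is already running on the default localhost:8000 and that the
# `requests` package is available -- both are assumptions, not requirements
# introduced by these patches.
import requests

resp = requests.get("http://localhost:8000/metrics", timeout=5)
resp.raise_for_status()

for line in resp.text.splitlines():
    # Skip the "# HELP" / "# TYPE" comment lines and any non-vLLM series;
    # keep samples such as:
    #   vllm:num_requests_running{model_name="..."} 1.0
    if line.startswith("vllm:"):
        print(line)

In production the same endpoint would normally be registered as a Prometheus scrape target rather than polled by hand; the model_name label attached via add_global_metrics_labels lets dashboards distinguish metrics from multiple deployments.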