From cfa8c2bfd6fd5836f0c8a3750131890a54834fb2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:06:38 +0000 Subject: [PATCH 01/67] added code Signed-off-by: rshaw@neuralmagic.com --- benchmarks/benchmark_serving.py | 2 +- .../openai_chat_completion_client.py | 2 +- vllm/v1/core/scheduler.py | 21 ++++++--- vllm/v1/engine/__init__.py | 3 ++ vllm/v1/engine/async_llm.py | 45 ++++++++++-------- vllm/v1/engine/core.py | 47 +++++-------------- vllm/v1/engine/core_client.py | 40 +++++----------- vllm/v1/engine/llm_engine.py | 6 +-- vllm/v1/metrics/__init__.py | 0 vllm/v1/metrics/loggers.py | 39 +++++++++++++++ vllm/v1/metrics/stats.py | 20 ++++++++ 11 files changed, 134 insertions(+), 91 deletions(-) create mode 100644 vllm/v1/metrics/__init__.py create mode 100644 vllm/v1/metrics/loggers.py create mode 100644 vllm/v1/metrics/stats.py diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4eb0e1f8ac903..7698e7f50120c 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -791,7 +791,7 @@ def main(args: argparse.Namespace): base_url = f"http://{args.host}:{args.port}" tokenizer = get_tokenizer(tokenizer_id, - tokenizer_mode=tokenizer_mode, + # tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) if args.dataset is not None: diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index bbada3891bd19..a7925f345709a 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -2,7 +2,7 @@ # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" +openai_api_base = "http://localhost:8001/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index b26716f5c02e6..6d280a53802dd 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -8,7 +8,8 @@ from vllm.sampling_params import SamplingParams from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager -from vllm.v1.engine import EngineCoreOutput +from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs +from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -394,12 +395,12 @@ def update_from_output( self, scheduler_output: "SchedulerOutput", model_runner_output: "ModelRunnerOutput", - ) -> List[EngineCoreOutput]: + ) -> EngineCoreOutputs: # NOTE(woosuk): This method doesn't consider speculative decoding. sampled_token_ids = model_runner_output.sampled_token_ids num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] - engine_core_outputs: List[EngineCoreOutput] = [] + outputs: List[EngineCoreOutput] = [] for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -438,7 +439,7 @@ def update_from_output( finished=request.is_finished(), finish_reason=request.get_finished_reason(), stop_reason=request.stop_reason) - engine_core_outputs.append(output) + outputs.append(output) # Breakout of the loop. 
if stopped: @@ -446,7 +447,10 @@ def update_from_output( new_running.append(request) self.running = new_running - return engine_core_outputs + return EngineCoreOutputs( + outputs=outputs, + scheduler_stats=self.make_stats(), + ) def _check_stop(self, request: Request) -> bool: if (request.num_tokens >= self.max_model_len @@ -514,7 +518,12 @@ def get_num_unfinished_requests(self) -> int: def has_unfinished_requests(self) -> bool: return self.get_num_unfinished_requests() > 0 - + + def make_stats(self) -> SchedulerStats: + return SchedulerStats( + num_running_reqs=len(self.running), + num_waiting_reqs=len(self.waiting), + ) @dataclass class NewRequestData: diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 5e3c5e327ef63..4fb2a310064f5 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -4,6 +4,8 @@ import msgspec +from vllm.v1.metrics.stats import SchedulerStats + if TYPE_CHECKING: from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs @@ -56,6 +58,7 @@ class EngineCoreOutputs( # [num_reqs] outputs: List[EngineCoreOutput] + scheduler_stats: SchedulerStats @dataclass diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5daae45dee85c..33955a8c4acb9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -4,9 +4,8 @@ from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.protocol import EngineClient -from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType +from vllm.inputs import INPUT_REGISTRY, PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -21,6 +20,9 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor +from vllm.v1.metrics.loggers import StatLoggerBase +from vllm.v1.metrics.stats import SchedulerStats +from vllm.v1.metrics.loggers import LoggingStatLogger from vllm.v1.executor.abstract import Executor logger = init_logger(__name__) @@ -33,19 +35,16 @@ def __init__( vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - input_registry: InputRegistry = INPUT_REGISTRY, - use_cached_outputs: bool = False, - log_requests: bool = True, - start_engine_loop: bool = True, + log_requests: bool, ) -> None: - assert start_engine_loop - + # Logging. self.log_requests = log_requests self.log_stats = log_stats - self.stat_loggers = stat_loggers + self.stat_loggers: List[StatLoggerBase] = [ + LoggingStatLogger(), + # PrometheusStatLogger(), + ] self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -65,7 +64,7 @@ def __init__( cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, tokenizer=self.tokenizer, - input_registry=input_registry, + input_registry=INPUT_REGISTRY, ) # Detokenizer (converts EngineCoreOutputs --> RequestOutput). 
@@ -82,7 +81,6 @@ def __init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, - log_stats=self.log_stats, ) self.output_handler: Optional[asyncio.Task] = None @@ -92,9 +90,7 @@ def from_engine_args( cls, engine_args: AsyncEngineArgs, engine_config: Optional[VllmConfig] = None, - start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, ) -> "AsyncLLM": """Create an AsyncLLM from the EngineArgs.""" @@ -112,9 +108,6 @@ def from_engine_args( executor_class=executor_class, log_requests=not engine_args.disable_log_requests, log_stats=not engine_args.disable_log_stats, - start_engine_loop=start_engine_loop, - usage_context=usage_context, - stat_loggers=stat_loggers, ) def shutdown(self): @@ -254,7 +247,7 @@ async def _run_output_handler(self): outputs = await self.engine_core.get_output_async() # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step(outputs) + request_outputs, reqs_to_abort = self.detokenizer.step(outputs.outputs) # 3) Put the RequestOutputs into the per-request queues. self._process_request_outputs(request_outputs) @@ -262,6 +255,11 @@ async def _run_output_handler(self): # 4) Abort any requests that finished due to stop strings. await self.engine_core.abort_requests_async(reqs_to_abort) + # 5) Log any stats. + await self._log_stats( + scheduler_stats=outputs.scheduler_stats + ) + except Exception as e: logger.exception("EngineCore output handler hit an error: %s", e) kill_process_tree(os.getpid()) @@ -278,6 +276,15 @@ async def abort(self, request_id: str) -> None: if request_id in self.rid_to_queue: del self.rid_to_queue[request_id] + + async def _log_stats(self, scheduler_stats: SchedulerStats): + """Log stats to the stat loggers.""" + if not self.log_stats: + return + + for logger in self.stat_loggers: + logger.log(scheduler_stats=scheduler_stats) + def encode( self, prompt: PromptType, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 975ce11fe8aff..6949ea129a2ae 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -19,7 +19,8 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) + EngineCoreRequestType, EngineCoreRequestUnion, + SchedulerStats) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -28,9 +29,8 @@ logger = init_logger(__name__) -POLLING_TIMEOUT_MS = 5000 +POLLING_TIMEOUT_MS = 2500 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5 class EngineCore: @@ -40,10 +40,8 @@ def __init__( self, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, ): assert vllm_config.model_config.runner_type != "pooling" - self.log_stats = log_stats logger.info("Initializing an LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) @@ -62,8 +60,6 @@ def __init__( vllm_config.cache_config, vllm_config.lora_config) - self._last_logging_time = time.time() - self.mm_input_mapper_server = MMInputMapperServer( vllm_config.model_config) @@ -114,7 +110,7 @@ def abort_requests(self, request_ids: List[str]): self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) - def step(self) -> List[EngineCoreOutput]: + def step(self) -> EngineCoreOutputs: """Schedule, execute, and make 
output.""" if not self.scheduler.has_unfinished_requests(): @@ -143,9 +139,8 @@ def __init__( ready_pipe: Connection, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, ): - super().__init__(vllm_config, executor_class, log_stats) + super().__init__(vllm_config, executor_class) # Background Threads and Queues for IO. These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, @@ -153,7 +148,7 @@ def __init__( # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue() - self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue() + self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, ), daemon=True).start() @@ -217,7 +212,10 @@ def run_busy_loop(self): self._handle_client_request(req) break except queue.Empty: - self._log_stats() + # Push out most recent scheduler stats to client. + stats = self.scheduler.make_stats() + self.output_queue.put_nowait(EngineCoreOutputs( + outputs=[], scheduler_stats=stats)) logger.debug("EngineCore busy loop waiting.") except BaseException: raise @@ -230,27 +228,9 @@ def run_busy_loop(self): # 3) Step the engine core. outputs = self.step() - # 4) Put EngineCoreOutputs into the output queue. + # 5) Put EngineCoreOutputs into the output queue. self.output_queue.put_nowait(outputs) - - self._log_stats() - - def _log_stats(self): - """Log basic stats every LOGGING_TIME_S""" - - if not self.log_stats: - return - - now = time.time() - - if now - self._last_logging_time > LOGGING_TIME_S: - logger.info( - "RUNNING: %s | WAITING: %s", - len(self.scheduler.running), - len(self.scheduler.waiting), - ) - - self._last_logging_time = now + def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: """Handle EngineCoreRequest or EngineCoreABORT from Client.""" @@ -301,7 +281,6 @@ def process_output_socket(self, output_path: str): with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: - engine_core_outputs = self.output_queue.get() - outputs = EngineCoreOutputs(outputs=engine_core_outputs) + outputs = self.output_queue.get() encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index a4a45ae05ff9e..bb51a52d52dc8 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -12,7 +12,7 @@ from vllm.logger import init_logger from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, make_zmq_socket) -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) from vllm.v1.engine.core import EngineCore, EngineCoreProc @@ -40,7 +40,6 @@ def make_client( asyncio_mode: bool, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, ) -> "EngineCoreClient": # TODO: support this for debugging purposes. 
@@ -50,18 +49,18 @@ def make_client( "is not currently supported.") if multiprocess_mode and asyncio_mode: - return AsyncMPClient(vllm_config, executor_class, log_stats) + return AsyncMPClient(vllm_config, executor_class) if multiprocess_mode and not asyncio_mode: - return SyncMPClient(vllm_config, executor_class, log_stats) + return SyncMPClient(vllm_config, executor_class) - return InprocClient(vllm_config, executor_class, log_stats) + return InprocClient(vllm_config, executor_class) @abstractmethod def shutdown(self): ... - def get_output(self) -> List[EngineCoreOutput]: + def get_output(self) -> EngineCoreOutputs: raise NotImplementedError def add_request(self, request: EngineCoreRequest) -> None: @@ -73,7 +72,7 @@ def profile(self, is_start: bool = True) -> None: def abort_requests(self, request_ids: List[str]) -> None: raise NotImplementedError - async def get_output_async(self) -> List[EngineCoreOutput]: + async def get_output_async(self) -> EngineCoreOutputs: raise NotImplementedError async def add_request_async(self, request: EngineCoreRequest) -> None: @@ -99,7 +98,7 @@ class InprocClient(EngineCoreClient): def __init__(self, *args, **kwargs): self.engine_core = EngineCore(*args, **kwargs) - def get_output(self) -> List[EngineCoreOutput]: + def get_output(self) -> EngineCoreOutputs: return self.engine_core.step() def add_request(self, request: EngineCoreRequest) -> None: @@ -133,7 +132,6 @@ def __init__( asyncio_mode: bool, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, ): # The child processes will send SIGUSR1 when unrecoverable # errors happen. We kill the process tree here so that the @@ -180,7 +178,6 @@ def sigusr1_handler(signum, frame): process_kwargs={ "vllm_config": vllm_config, "executor_class": executor_class, - "log_stats": log_stats, }) def shutdown(self): @@ -194,22 +191,17 @@ def shutdown(self): class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" - def __init__(self, - vllm_config: VllmConfig, - executor_class: Type[Executor], - log_stats: bool = False): + def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): super().__init__( asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, - log_stats=log_stats, ) - def get_output(self) -> List[EngineCoreOutput]: + def get_output(self) -> EngineCoreOutputs: (frame, ) = self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frame.buffer).outputs - return engine_core_outputs + return self.decoder.decode(frame.buffer) def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -232,23 +224,17 @@ def profile(self, is_start: bool = True) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, - vllm_config: VllmConfig, - executor_class: Type[Executor], - log_stats: bool = False): + def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): super().__init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, - log_stats=log_stats, ) - async def get_output_async(self) -> List[EngineCoreOutput]: + async def get_output_async(self) -> EngineCoreOutputs: frames = await self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs - - return engine_core_outputs + return self.decoder.decode(frames[0].buffer) async def _send_input(self, request_type: EngineCoreRequestType, request: 
EngineCoreRequestUnion) -> None: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 8ced3a34d2da3..7a75623aa5bf1 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -42,6 +42,7 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: + assert log_stats is False self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -74,7 +75,6 @@ def __init__( asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, - log_stats=False, ) @classmethod @@ -147,11 +147,11 @@ def add_request( def step(self) -> List[RequestOutput]: # 1) Get EngineCoreOutput from the EngineCore. - engine_core_outputs = self.engine_core.get_output() + outputs = self.engine_core.get_output() # 2) Detokenizer the EngineCoreOutput. request_outputs, requests_to_abort = self.detokenizer.step( - engine_core_outputs) + outputs.outputs) # 3) Abort requests that finished due to stopping criteria. if requests_to_abort: diff --git a/vllm/v1/metrics/__init__.py b/vllm/v1/metrics/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py new file mode 100644 index 0000000000000..660823dc402a5 --- /dev/null +++ b/vllm/v1/metrics/loggers.py @@ -0,0 +1,39 @@ +from abc import ABC, abstractmethod +import time + +from vllm.logger import init_logger +from vllm.v1.metrics.stats import SchedulerStats + +logger = init_logger(__name__) + +_LOCAL_LOGGING_INTERVAL_SEC = 5.0 + + +class StatLoggerBase(ABC): + + @abstractmethod + def log(self, scheduler_stats: SchedulerStats): + ... + + +class LoggingStatLogger(StatLoggerBase): + + def __init__(self): + self.last_log_time = time.monotonic() + + def log(self, scheduler_stats: SchedulerStats): + """Log Stats to standard output.""" + + # Log every _LOCAL_LOGGING_INTERVAL_SEC. + now = time.monotonic() + if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC: + return + self.last_log_time = now + + # Format and print output. 
+ logger.info( + "Running: %d reqs, Waiting: %d reqs ", + scheduler_stats.num_running_reqs, + scheduler_stats.num_waiting_reqs, + ) + \ No newline at end of file diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py new file mode 100644 index 0000000000000..6690a4aa6252f --- /dev/null +++ b/vllm/v1/metrics/stats.py @@ -0,0 +1,20 @@ +from dataclasses import dataclass + +@dataclass +class SchedulerStats: + """Stats associated with the scheduler.""" + + num_running_reqs: int = 0 + num_waiting_reqs: int = 0 + + # gpu_cache_usage: float = 0.0 + # gpu_prefix_cache_hit_rate: float = 0.0 + +@dataclass +class RequestStats: + """Stats associated with a request.""" + pass + +@dataclass +class Stats: + scheduler: SchedulerStats \ No newline at end of file From 6d8e4f300b5a5f37b050fd5105d53a9848d400d4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:09:09 +0000 Subject: [PATCH 02/67] fixed --- benchmarks/benchmark_serving.py | 2 +- vllm/v1/metrics/stats.py | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 7698e7f50120c..4eb0e1f8ac903 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -791,7 +791,7 @@ def main(args: argparse.Namespace): base_url = f"http://{args.host}:{args.port}" tokenizer = get_tokenizer(tokenizer_id, - # tokenizer_mode=tokenizer_mode, + tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) if args.dataset is not None: diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 6690a4aa6252f..c456cc6915692 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -9,12 +9,3 @@ class SchedulerStats: # gpu_cache_usage: float = 0.0 # gpu_prefix_cache_hit_rate: float = 0.0 - -@dataclass -class RequestStats: - """Stats associated with a request.""" - pass - -@dataclass -class Stats: - scheduler: SchedulerStats \ No newline at end of file From c78a56f92bbbbcdfbc632b5c5458993cbbfec2e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:09:44 +0000 Subject: [PATCH 03/67] fixed --- examples/online_serving/openai_chat_completion_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index a7925f345709a..bbada3891bd19 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -2,7 +2,7 @@ # Modify OpenAI's API key and API base to use vLLM's API server. 
openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8001/v1" +openai_api_base = "http://localhost:8000/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") From 7b397057ec58fb37419886f974d7c6deefba4036 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:25:34 +0000 Subject: [PATCH 04/67] updated --- vllm/v1/core/scheduler.py | 3 ++- vllm/v1/engine/async_llm.py | 15 ++++++--------- vllm/v1/engine/core.py | 21 +++++++++++---------- vllm/v1/engine/core_client.py | 16 +++++++++++----- vllm/v1/metrics/loggers.py | 5 ++--- vllm/v1/metrics/stats.py | 1 + 6 files changed, 33 insertions(+), 28 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 6d280a53802dd..f04e529891287 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -518,13 +518,14 @@ def get_num_unfinished_requests(self) -> int: def has_unfinished_requests(self) -> bool: return self.get_num_unfinished_requests() > 0 - + def make_stats(self) -> SchedulerStats: return SchedulerStats( num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting), ) + @dataclass class NewRequestData: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 33955a8c4acb9..7cf05ca9bed45 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -20,10 +20,9 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor -from vllm.v1.metrics.loggers import StatLoggerBase -from vllm.v1.metrics.stats import SchedulerStats -from vllm.v1.metrics.loggers import LoggingStatLogger from vllm.v1.executor.abstract import Executor +from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase +from vllm.v1.metrics.stats import SchedulerStats logger = init_logger(__name__) @@ -247,7 +246,8 @@ async def _run_output_handler(self): outputs = await self.engine_core.get_output_async() # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step(outputs.outputs) + request_outputs, reqs_to_abort = self.detokenizer.step( + outputs.outputs) # 3) Put the RequestOutputs into the per-request queues. self._process_request_outputs(request_outputs) @@ -256,9 +256,7 @@ async def _run_output_handler(self): await self.engine_core.abort_requests_async(reqs_to_abort) # 5) Log any stats. 
- await self._log_stats( - scheduler_stats=outputs.scheduler_stats - ) + await self._log_stats(scheduler_stats=outputs.scheduler_stats) except Exception as e: logger.exception("EngineCore output handler hit an error: %s", e) @@ -276,12 +274,11 @@ async def abort(self, request_id: str) -> None: if request_id in self.rid_to_queue: del self.rid_to_queue[request_id] - async def _log_stats(self, scheduler_stats: SchedulerStats): """Log stats to the stat loggers.""" if not self.log_stats: return - + for logger in self.stat_loggers: logger.log(scheduler_stats=scheduler_stats) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6949ea129a2ae..84dbe248e4d53 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -17,10 +17,9 @@ maybe_register_config_serialize_by_value) from vllm.utils import get_exception_traceback, zmq_socket_ctx from vllm.v1.core.scheduler import Scheduler -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion, - SchedulerStats) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -114,7 +113,8 @@ def step(self) -> EngineCoreOutputs: """Schedule, execute, and make output.""" if not self.scheduler.has_unfinished_requests(): - return [] + return EngineCoreOutputs( + outputs=[], scheduler_stats=self.scheduler.make_stats()) scheduler_output = self.scheduler.schedule() output = self.model_executor.execute_model(scheduler_output) @@ -139,9 +139,12 @@ def __init__( ready_pipe: Connection, vllm_config: VllmConfig, executor_class: Type[Executor], + log_stats: bool = False, ): super().__init__(vllm_config, executor_class) + self.log_stats = log_stats + # Background Threads and Queues for IO. These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, # and to overlap some serialization/deserialization with the @@ -212,10 +215,9 @@ def run_busy_loop(self): self._handle_client_request(req) break except queue.Empty: - # Push out most recent scheduler stats to client. - stats = self.scheduler.make_stats() - self.output_queue.put_nowait(EngineCoreOutputs( - outputs=[], scheduler_stats=stats)) + # Break out the loops so we can log_stats via step(). + if self.log_stats: + break logger.debug("EngineCore busy loop waiting.") except BaseException: raise @@ -230,7 +232,6 @@ def run_busy_loop(self): # 5) Put EngineCoreOutputs into the output queue. 
self.output_queue.put_nowait(outputs) - def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: """Handle EngineCoreRequest or EngineCoreABORT from Client.""" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index bb51a52d52dc8..176fa839c8f58 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -12,9 +12,9 @@ from vllm.logger import init_logger from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, make_zmq_socket) -from vllm.v1.engine import (EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder @@ -132,6 +132,7 @@ def __init__( asyncio_mode: bool, vllm_config: VllmConfig, executor_class: Type[Executor], + log_stats: bool, ): # The child processes will send SIGUSR1 when unrecoverable # errors happen. We kill the process tree here so that the @@ -178,6 +179,7 @@ def sigusr1_handler(signum, frame): process_kwargs={ "vllm_config": vllm_config, "executor_class": executor_class, + "log_stats": log_stats, }) def shutdown(self): @@ -191,11 +193,13 @@ def shutdown(self): class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): + def __init__(self, vllm_config: VllmConfig, + executor_class: Type[Executor]): super().__init__( asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, + log_stats=False, ) def get_output(self) -> EngineCoreOutputs: @@ -224,11 +228,13 @@ def profile(self, is_start: bool = True) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): + def __init__(self, vllm_config: VllmConfig, + executor_class: Type[Executor]): super().__init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, + log_stats=True, ) async def get_output_async(self) -> EngineCoreOutputs: diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 660823dc402a5..8feeef17542e6 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,5 +1,5 @@ -from abc import ABC, abstractmethod import time +from abc import ABC, abstractmethod from vllm.logger import init_logger from vllm.v1.metrics.stats import SchedulerStats @@ -10,7 +10,7 @@ class StatLoggerBase(ABC): - + @abstractmethod def log(self, scheduler_stats: SchedulerStats): ... 
@@ -36,4 +36,3 @@ def log(self, scheduler_stats: SchedulerStats): scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, ) - \ No newline at end of file diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index c456cc6915692..5ebb4fd5b37db 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,5 +1,6 @@ from dataclasses import dataclass + @dataclass class SchedulerStats: """Stats associated with the scheduler.""" From 6e9cd1cd9177c1ba00e4e207ff9b8991b233b4d9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:28:41 +0000 Subject: [PATCH 05/67] updated --- vllm/v1/engine/async_llm.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7cf05ca9bed45..b109e5e34ea7b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -5,7 +5,7 @@ from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient -from vllm.inputs import INPUT_REGISTRY, PromptType +from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -34,7 +34,11 @@ def __init__( vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool, - log_requests: bool, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + input_registry: InputRegistry = INPUT_REGISTRY, + use_cached_outputs: bool = False, + log_requests: bool = True, + start_engine_loop: bool = True, ) -> None: # Logging. @@ -63,7 +67,7 @@ def __init__( cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, tokenizer=self.tokenizer, - input_registry=INPUT_REGISTRY, + input_registry=input_registry, ) # Detokenizer (converts EngineCoreOutputs --> RequestOutput). @@ -89,6 +93,7 @@ def from_engine_args( cls, engine_args: AsyncEngineArgs, engine_config: Optional[VllmConfig] = None, + start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, ) -> "AsyncLLM": """Create an AsyncLLM from the EngineArgs.""" @@ -107,6 +112,8 @@ def from_engine_args( executor_class=executor_class, log_requests=not engine_args.disable_log_requests, log_stats=not engine_args.disable_log_stats, + start_engine_loop=start_engine_loop, + usage_context=usage_context, ) def shutdown(self): From 2657b7f96dec516622a48e0d3b369b6920191624 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:29:09 +0000 Subject: [PATCH 06/67] fixed --- vllm/v1/engine/async_llm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b109e5e34ea7b..247c5bc966255 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -41,7 +41,8 @@ def __init__( start_engine_loop: bool = True, ) -> None: - # Logging. 
+ assert start_engine_loop + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ From 249b9ff1eb8288f3fda1f05c9d54962a08787014 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:30:08 +0000 Subject: [PATCH 07/67] updated --- vllm/v1/engine/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 84dbe248e4d53..d65c1bee7643d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -28,8 +28,7 @@ logger = init_logger(__name__) -POLLING_TIMEOUT_MS = 2500 -POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 +POLLING_TIMEOUT_S = 2.5 class EngineCore: From c1f9292e7a14be2b07d392fe425f41e4f68984ab Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 23:10:27 +0000 Subject: [PATCH 08/67] refactoring metrics --- vllm/v1/engine/async_llm.py | 24 +-- vllm/v1/engine/detokenizer.py | 273 ---------------------------------- vllm/v1/engine/llm_engine.py | 13 +- vllm/v1/metrics/loggers.py | 1 - vllm/v1/metrics/stats.py | 8 + 5 files changed, 27 insertions(+), 292 deletions(-) delete mode 100644 vllm/v1/engine/detokenizer.py diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 247c5bc966255..013140bf6cd40 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -18,7 +18,7 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase @@ -72,7 +72,7 @@ def __init__( ) # Detokenizer (converts EngineCoreOutputs --> RequestOutput). - self.detokenizer = Detokenizer( + self.output_processor = OutputProcessor( tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, trust_remote_code=vllm_config.model_config.trust_remote_code, @@ -152,7 +152,7 @@ async def add_request( priority) # 3) Add the request to Detokenizer (this process). - self.detokenizer.add_request(request) + self.output_processor.add_request(request) # 4) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(request) @@ -251,20 +251,22 @@ async def _run_output_handler(self): try: while True: # 1) Pull EngineCoreOutput from the EngineCore. - outputs = await self.engine_core.get_output_async() + engine_core_outputs = await self.engine_core.get_output_async() # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step( - outputs.outputs) + processed_outputs = self.output_processor.step(engine_core_outputs.outputs) # 3) Put the RequestOutputs into the per-request queues. - self._process_request_outputs(request_outputs) + self._process_request_outputs(processed_outputs.request_outputs) # 4) Abort any requests that finished due to stop strings. - await self.engine_core.abort_requests_async(reqs_to_abort) + await self.engine_core.abort_requests_async(processed_outputs.reqs_to_abort) # 5) Log any stats. 
- await self._log_stats(scheduler_stats=outputs.scheduler_stats) + await self._log_stats( + scheduler_stats=engine_core_outputs.scheduler_stats, + iteration_stats=processed_outputs.iteration_stats, + ) except Exception as e: logger.exception("EngineCore output handler hit an error: %s", e) @@ -275,7 +277,7 @@ async def abort(self, request_id: str) -> None: request_ids = [request_id] await self.engine_core.abort_requests_async(request_ids) - self.detokenizer.abort_requests(request_ids) + self.output_processor.abort_requests(request_ids) # If a request finishes while we await then the request_id # will be removed from the tracked queues before we get here. @@ -315,7 +317,7 @@ async def get_tokenizer( lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: assert lora_request is None - return self.detokenizer.tokenizer + return self.output_processor.tokenizer async def is_tracing_enabled(self) -> bool: return False diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py deleted file mode 100644 index 65be9e58e03c8..0000000000000 --- a/vllm/v1/engine/detokenizer.py +++ /dev/null @@ -1,273 +0,0 @@ -from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Union - -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger -from vllm.outputs import RequestOutput -from vllm.sampling_params import RequestOutputKind -from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest - -logger = init_logger(__name__) - - -@dataclass -class IncrementalDetokenizer: - - # Generation data - output_text: str - tokens: List[str] - token_ids: List[int] - - # Stop strings - stop: List[str] - include_stop_str_in_output: bool - - # Metadata for incremental detokenization - prefix_offset: int - read_offset: int - - # Parameters for detokenization - skip_special_tokens: bool - spaces_between_special_tokens: bool - output_kind: RequestOutputKind - - # TODO: Probably decouple these - request_id: str - prompt: Optional[str] - prompt_token_ids: List[int] - - # Tokenizer for this request - tokenizer: AnyTokenizer - - # Accounting for stop string buffering - stop_buffer_length: int - _last_output_text_offset: int = 0 - - @property - def output_token_ids(self) -> List[int]: - assert len(self.token_ids) >= len(self.prompt_token_ids) - return self.token_ids[len(self.prompt_token_ids):] - - @classmethod - def from_new_request( - cls, - tokenizer: AnyTokenizer, - request: EngineCoreRequest, - ) -> "IncrementalDetokenizer": - - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=tokenizer, - prompt_ids=request.prompt_token_ids, - skip_special_tokens=request.sampling_params.skip_special_tokens, - ) - - stops = request.sampling_params.stop - # Number of chars to hold back when stop strings are to be excluded - # from streamed output. - if stops and not request.sampling_params.include_stop_str_in_output: - stop_buffer_length = max(len(s) for s in stops) - 1 - else: - stop_buffer_length = 0 - - return cls( - output_text="", - tokens=tokens, - # Detokenizer mutates this list, so need a unique copy. - # NOTE(Nick): could we take ownership of it though? - token_ids=request.prompt_token_ids.copy(), - stop=stops, - include_stop_str_in_output=request.sampling_params. 
- include_stop_str_in_output, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=request.sampling_params.skip_special_tokens, - spaces_between_special_tokens=request.sampling_params. - spaces_between_special_tokens, - output_kind=request.sampling_params.output_kind, - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, - tokenizer=tokenizer, - stop_buffer_length=stop_buffer_length, - ) - - def add_tokens( - self, - new_token_ids: List[int], - finish_reason: Optional[str], - stop_reason: Optional[Union[int, str, None]], - ) -> Optional[RequestOutput]: - """ - Update RequestState for the request_id by: - 1) Detokenize the new token ids incrementally. - 2) Update the RequestOutput with the new text. - """ - - # 1) Detokenize the new token ids incrementally. - # TODO(woosuk): This method becomes very inefficient when the number of - # new_token_ids is more than 1. We need to optimize this. - decoded_text = "" - for new_token_id in new_token_ids: - self.token_ids.append(new_token_id) - (new_tokens, new_decoded_token_text, prefix_offset, - read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=self.token_ids, - prev_tokens=self.tokens, - prefix_offset=self.prefix_offset, - read_offset=self.read_offset, - skip_special_tokens=self.skip_special_tokens, - spaces_between_special_tokens=self. - spaces_between_special_tokens, - ) - - self.tokens.extend(new_tokens) - self.prefix_offset = prefix_offset - self.read_offset = read_offset - self.output_text += new_decoded_token_text - - decoded_text += new_decoded_token_text - - # 2) Evaluate stop criteria. - if self.stop: - stop = StopChecker.check_stop_strings( - output_text=self.output_text, - new_char_count=len(decoded_text), - stop=self.stop, - include_in_output=self.include_stop_str_in_output, - ) - if stop is not None: - stop_str, truncate_to = stop - if truncate_to != -1: - self.output_text = self.output_text[:truncate_to] - finish_reason = "stop" # TODO: use constant - stop_reason = stop_str - - # TODO: handle stop_token_ids here too? - - # 3) Update the RequestOutput object with the new text. - finished = bool(finish_reason) - if self.output_kind == RequestOutputKind.FINAL_ONLY \ - and not finished: - return None - - delta = self.output_kind == RequestOutputKind.DELTA - output_text = self._get_next_output_text(finished, delta) - token_ids = new_token_ids if delta else self.output_token_ids - - request_output = RequestOutput.new( - self.request_id, - self.prompt, - self.prompt_token_ids, - output_text, - token_ids, - finished, - ) - - if finished: - completion_output = request_output.outputs[0] - completion_output.finish_reason = finish_reason - completion_output.stop_reason = stop_reason - - return request_output - - def _get_next_output_text(self, finished: bool, delta: bool) -> str: - """If delta is True, only new text since the last call to - this method is returned""" - - # We return the full output text if the sequence is finished. 
- buffer_length = 0 if finished else self.stop_buffer_length - if not delta: - return self.output_text[:-buffer_length] if buffer_length else ( - self.output_text) - length = len(self.output_text) - buffer_length - last_offset = self._last_output_text_offset - if last_offset < length: - self._last_output_text_offset = length - return self.output_text[last_offset:length] - return "" - - -class Detokenizer: - - def __init__(self, - tokenizer_name: str, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - revision: Optional[str] = None): - # TODO: once we support LoRA, we should should pass the tokenizer - # here. We currently have two copies (this + in the LLMEngine). - self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, - tokenizer_mode=tokenizer_mode, - trust_remote_code=trust_remote_code, - revision=revision) - - # Request id -> IncrementalDetokenizer - self.request_states: Dict[str, IncrementalDetokenizer] = {} - - def is_request_active(self, request_id: str): - return request_id in self.request_states - - def get_num_unfinished_requests(self): - return len(self.request_states) - - def has_unfinished_requests(self) -> bool: - return len(self.request_states) > 0 - - def abort_requests( - self, - request_ids: Iterable[str], - ) -> None: - """Remove the request_ids from the Detokenizer.""" - - for request_id in request_ids: - self.request_states.pop(request_id, None) - - def add_request( - self, - request: EngineCoreRequest, - ): - """Add new request to the Detokenizer.""" - - assert (request.request_id not in self.request_states) - - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) - self.request_states[request.request_id] = request_state - - def step( - self, encore_core_outputs: List[EngineCoreOutput] - ) -> Tuple[List[RequestOutput], List[str]]: - """Update state and request the RequestOutputs to the LLMEngine.""" - - request_outputs: List[RequestOutput] = [] - requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs: - request_id = engine_core_output.request_id - detokenizer = self.request_states.get(request_id) - if detokenizer is None: - # Ignore output for already-aborted request. - continue - - # Detokenize and update state. - request_output = detokenizer.add_tokens( - new_token_ids=engine_core_output.new_token_ids, - finish_reason=engine_core_output.finish_reason, - stop_reason=engine_core_output.stop_reason, - ) - - if request_output is not None: - # Add to RequestOutputs list. - request_outputs.append(request_output) - - # Free completed requests. - if request_output.finished: - self.request_states.pop(request_id) - if not engine_core_output.finished: - requests_to_abort.append(request_id) - - # Return to EngineClient. 
- return request_outputs, requests_to_abort diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 7a75623aa5bf1..27321752c3efa 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -18,7 +18,7 @@ BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.v1.engine.core_client import EngineCoreClient -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -42,7 +42,6 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - assert log_stats is False self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -62,7 +61,7 @@ def __init__( mm_registry=mm_registry) # Detokenizer (converts EngineCoreOutputs --> RequestOutput) - self.detokenizer = Detokenizer( + self.output_processor = OutputProcessor( tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, trust_remote_code=vllm_config.model_config.trust_remote_code, @@ -104,10 +103,10 @@ def from_engine_args( multiprocess_mode=enable_multiprocessing) def get_num_unfinished_requests(self) -> int: - return self.detokenizer.get_num_unfinished_requests() + return self.output_processor.get_num_unfinished_requests() def has_unfinished_requests(self) -> bool: - return self.detokenizer.has_unfinished_requests() + return self.output_processor.has_unfinished_requests() @classmethod def validate_outputs(cls, outputs, output_type): @@ -117,7 +116,7 @@ def abort_request(self, request_ids: List[str]) -> None: """Remove request_ids from EngineCore and Detokenizer.""" self.engine_core.abort_requests(request_ids) - self.detokenizer.abort_requests(request_ids) + self.output_processor.abort_requests(request_ids) def add_request( self, @@ -139,7 +138,7 @@ def add_request( priority) # 2) Add the request to Detokenizer. - self.detokenizer.add_request(request) + self.output_processor.add_request(request) # 3) Add the request to EngineCore. self.engine_core.add_request(request) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 8feeef17542e6..4ed9193128647 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -22,7 +22,6 @@ def __init__(self): self.last_log_time = time.monotonic() def log(self, scheduler_stats: SchedulerStats): - """Log Stats to standard output.""" # Log every _LOCAL_LOGGING_INTERVAL_SEC. 
now = time.monotonic() diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 5ebb4fd5b37db..d9c7aa400005a 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -10,3 +10,11 @@ class SchedulerStats: # gpu_cache_usage: float = 0.0 # gpu_prefix_cache_hit_rate: float = 0.0 + + +@dataclass +class IterationStats: + """Stats associated with a single iteration""" + + num_generation_tokens: int = 0 + num_prompt_tokens: int = 0 From c641866d2454a5d2e7ec55a38eeb9b51e94fb4ea Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 14:12:00 +0000 Subject: [PATCH 09/67] updated --- vllm/v1/engine/llm_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 7a75623aa5bf1..ac392f5e4f4cf 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -42,7 +42,6 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - assert log_stats is False self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). From 1ce7a5fcf75f4a4eec620ed50189e06b5431ac47 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 14:13:37 +0000 Subject: [PATCH 10/67] updated --- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 247c5bc966255..e0ceb59dffcbd 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -47,7 +47,7 @@ def __init__( self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ LoggingStatLogger(), - # PrometheusStatLogger(), + # TODO(rob): PrometheusStatLogger(), ] self.model_config = vllm_config.model_config diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index d65c1bee7643d..e7f90d3c62142 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -214,10 +214,10 @@ def run_busy_loop(self): self._handle_client_request(req) break except queue.Empty: - # Break out the loops so we can log_stats via step(). + logger.debug("EngineCore busy loop waiting.") + # Break out the loop so we can log_stats in step(). 
if self.log_stats: break - logger.debug("EngineCore busy loop waiting.") except BaseException: raise From f8de299c7c741dbbec83abbf5132b76bc7fda701 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 14:16:36 +0000 Subject: [PATCH 11/67] added output processor --- vllm/v1/engine/output_processor.py | 286 +++++++++++++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 vllm/v1/engine/output_processor.py diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py new file mode 100644 index 0000000000000..e2fcf29cf4915 --- /dev/null +++ b/vllm/v1/engine/output_processor.py @@ -0,0 +1,286 @@ +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple, Union + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import RequestOutputKind +from vllm.transformers_utils.detokenizer_utils import ( + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.metrics.stats import IterationStats + +logger = init_logger(__name__) + +@dataclass +class OutputProcessorOutput: + """Output from the OutputProcessor.""" + + request_outputs: List[RequestOutput] + requests_to_abort: List[str] + iteration_stats: IterationStats + +@dataclass +class IncrementalDetokenizer: + + # Generation data + output_text: str + tokens: List[str] + token_ids: List[int] + + # Stop strings + stop: List[str] + include_stop_str_in_output: bool + + # Metadata for incremental detokenization + prefix_offset: int + read_offset: int + + # Parameters for detokenization + skip_special_tokens: bool + spaces_between_special_tokens: bool + output_kind: RequestOutputKind + + # TODO: Probably decouple these + request_id: str + prompt: Optional[str] + prompt_token_ids: List[int] + + # Tokenizer for this request + tokenizer: AnyTokenizer + + # Accounting for stop string buffering + stop_buffer_length: int + _last_output_text_offset: int = 0 + + @property + def output_token_ids(self) -> List[int]: + assert len(self.token_ids) >= len(self.prompt_token_ids) + return self.token_ids[len(self.prompt_token_ids):] + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + ) -> "IncrementalDetokenizer": + + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=request.prompt_token_ids, + skip_special_tokens=request.sampling_params.skip_special_tokens, + ) + + stops = request.sampling_params.stop + # Number of chars to hold back when stop strings are to be excluded + # from streamed output. + if stops and not request.sampling_params.include_stop_str_in_output: + stop_buffer_length = max(len(s) for s in stops) - 1 + else: + stop_buffer_length = 0 + + return cls( + output_text="", + tokens=tokens, + # Detokenizer mutates this list, so need a unique copy. + # NOTE(Nick): could we take ownership of it though? + token_ids=request.prompt_token_ids.copy(), + stop=stops, + include_stop_str_in_output=request.sampling_params. + include_stop_str_in_output, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=request.sampling_params.skip_special_tokens, + spaces_between_special_tokens=request.sampling_params. 
+ spaces_between_special_tokens, + output_kind=request.sampling_params.output_kind, + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + tokenizer=tokenizer, + stop_buffer_length=stop_buffer_length, + ) + + def add_tokens( + self, + new_token_ids: List[int], + finish_reason: Optional[str], + stop_reason: Optional[Union[int, str, None]], + ) -> Optional[RequestOutput]: + """ + Update RequestState for the request_id by: + 1) Detokenize the new token ids incrementally. + 2) Update the RequestOutput with the new text. + """ + + # 1) Detokenize the new token ids incrementally. + # TODO(woosuk): This method becomes very inefficient when the number of + # new_token_ids is more than 1. We need to optimize this. + decoded_text = "" + for new_token_id in new_token_ids: + self.token_ids.append(new_token_id) + (new_tokens, new_decoded_token_text, prefix_offset, + read_offset) = detokenize_incrementally( + tokenizer=self.tokenizer, + all_input_ids=self.token_ids, + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + + self.tokens.extend(new_tokens) + self.prefix_offset = prefix_offset + self.read_offset = read_offset + self.output_text += new_decoded_token_text + + decoded_text += new_decoded_token_text + + # 2) Evaluate stop criteria. + if self.stop: + stop = StopChecker.check_stop_strings( + output_text=self.output_text, + new_char_count=len(decoded_text), + stop=self.stop, + include_in_output=self.include_stop_str_in_output, + ) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + self.output_text = self.output_text[:truncate_to] + finish_reason = "stop" # TODO: use constant + stop_reason = stop_str + + # TODO: handle stop_token_ids here too? + + # 3) Update the RequestOutput object with the new text. + finished = bool(finish_reason) + if self.output_kind == RequestOutputKind.FINAL_ONLY \ + and not finished: + return None + + delta = self.output_kind == RequestOutputKind.DELTA + output_text = self._get_next_output_text(finished, delta) + token_ids = new_token_ids if delta else self.output_token_ids + + request_output = RequestOutput.new( + self.request_id, + self.prompt, + self.prompt_token_ids, + output_text, + token_ids, + finished, + ) + + if finished: + completion_output = request_output.outputs[0] + completion_output.finish_reason = finish_reason + completion_output.stop_reason = stop_reason + + return request_output + + def _get_next_output_text(self, finished: bool, delta: bool) -> str: + """If delta is True, only new text since the last call to + this method is returned""" + + # We return the full output text if the sequence is finished. + buffer_length = 0 if finished else self.stop_buffer_length + if not delta: + return self.output_text[:-buffer_length] if buffer_length else ( + self.output_text) + length = len(self.output_text) - buffer_length + last_offset = self._last_output_text_offset + if last_offset < length: + self._last_output_text_offset = length + return self.output_text[last_offset:length] + return "" + + +class OutputProcessor: + + def __init__(self, + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None): + # TODO: once we support LoRA, we should should pass the tokenizer + # here. We currently have two copies (this + in the LLMEngine). 
+ self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + revision=revision) + + # Request id -> IncrementalDetokenizer + self.request_states: Dict[str, IncrementalDetokenizer] = {} + + def is_request_active(self, request_id: str): + return request_id in self.request_states + + def get_num_unfinished_requests(self): + return len(self.request_states) + + def has_unfinished_requests(self) -> bool: + return len(self.request_states) > 0 + + def abort_requests( + self, + request_ids: Iterable[str], + ) -> None: + """Remove the request_ids from the Detokenizer.""" + + for request_id in request_ids: + self.request_states.pop(request_id, None) + + def add_request( + self, + request: EngineCoreRequest, + ): + """Add new request to the Detokenizer.""" + + assert (request.request_id not in self.request_states) + + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, request) + self.request_states[request.request_id] = request_state + + def step( + self, encore_core_outputs: List[EngineCoreOutput] + ) -> OutputProcessorOutput: + """Update state and request the RequestOutputs to the LLMEngine.""" + + iteraton_stats = IterationStats() + request_outputs: List[RequestOutput] = [] + requests_to_abort: List[str] = [] + for engine_core_output in encore_core_outputs: + request_id = engine_core_output.request_id + detokenizer = self.request_states.get(request_id) + if detokenizer is None: + # Ignore output for already-aborted request. + continue + + # Detokenize and update state. + request_output = detokenizer.add_tokens( + new_token_ids=engine_core_output.new_token_ids, + finish_reason=engine_core_output.finish_reason, + stop_reason=engine_core_output.stop_reason, + ) + + if request_output is not None: + # Add to RequestOutputs list. + request_outputs.append(request_output) + + # Free completed requests. + if request_output.finished: + self.request_states.pop(request_id) + if not engine_core_output.finished: + requests_to_abort.append(request_id) + + # Return to EngineClient. 
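# Why step() reports requests_to_abort, shown as a hedged toy reconciliation
# (plain dicts rather than vLLM types): the detokenizer can finish a request
# on a stop *string* before EngineCore, which only sees token ids, knows the
# request is done, so those request ids must still be aborted in EngineCore.
core_finished = {"req-a": False, "req-b": True}     # EngineCore's view
detok_finished = {"req-a": True, "req-b": True}     # detokenizer's view
requests_to_abort = [rid for rid, done in detok_finished.items()
                     if done and not core_finished[rid]]
assert requests_to_abort == ["req-a"]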
+ return OutputProcessorOutput( + request_outputs=request_outputs, + requests_to_abort=requests_to_abort, + iteration_stats=iteraton_stats, + ) From 49ca9bbdd5a53268cfb6c894978d3d00e477e149 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 18:44:21 +0000 Subject: [PATCH 12/67] added all files --- vllm/v1/engine/async_llm.py | 117 +++++----- vllm/v1/engine/core_client.py | 6 +- vllm/v1/engine/detokenizer.py | 180 ++++++++++++++++ vllm/v1/engine/llm_engine.py | 68 +++--- vllm/v1/engine/output_processor.py | 332 ++++++++--------------------- vllm/v1/engine/processor.py | 5 +- vllm/v1/engine/request_state.py | 46 ++++ vllm/v1/metrics/stats.py | 24 ++- 8 files changed, 428 insertions(+), 350 deletions(-) create mode 100644 vllm/v1/engine/detokenizer.py create mode 100644 vllm/v1/engine/request_state.py diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d5563b5d481cf..d7068d1262843 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -20,9 +20,10 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor +from vllm.v1.engine.request_state import RequestState from vllm.v1.executor.abstract import Executor from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase -from vllm.v1.metrics.stats import SchedulerStats +from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -52,31 +53,29 @@ def __init__( self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer = init_tokenizer_from_configs( + self.tokenizer_group = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) - self.tokenizer.ping() + self.tokenizer_group.ping() - # Request streams (map of request_id -> queue). - self.rid_to_queue: Dict[str, asyncio.Queue] = {} + # Request States (map of request_id -> RequestState). + self.request_states: Dict[str, RequestState] = {} - # Processor (converts Inputs --> EngineCoreRequests). + # Processor (convert Inputs --> EngineCoreRequests). self.processor = Processor( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, + tokenizer_group=self.tokenizer_group, input_registry=input_registry, ) - # Detokenizer (converts EngineCoreOutputs --> RequestOutput). + # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). self.output_processor = OutputProcessor( - tokenizer_name=vllm_config.model_config.tokenizer, - tokenizer_mode=vllm_config.model_config.tokenizer_mode, - trust_remote_code=vllm_config.model_config.trust_remote_code, - revision=vllm_config.model_config.tokenizer_revision, + request_states=self.request_states, + log_stats=self.log_stats, ) # EngineCore (starts the engine in background process). @@ -139,28 +138,31 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - # 1) Create a new output queue for the request. - if request_id in self.rid_to_queue: + if request_id in self.request_states: raise ValueError(f"Request id {request_id} already running.") - self.rid_to_queue[request_id] = asyncio.Queue() - # 2) Convert Input --> Request. + # 1) Convert Input --> Request. 
request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 3) Add the request to Detokenizer (this process). - self.output_processor.add_request(request) + # 2) Make a nnew RequestState and queue. + queue: asyncio.Queue[RequestOutput] = asyncio.Queue() + self.request_states[request_id] = RequestState.from_new_request( + tokenizer=self.get_tokenizer(), + request=request, + queue=queue, + ) - # 4) Add the EngineCoreRequest to EngineCore (separate process). + # 3) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(request) if self.log_requests: logger.info("Added request %s.", request_id) - return self.rid_to_queue[request_id] + return queue # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion @@ -197,8 +199,7 @@ async def generate( # we can call __init__ before the event loop, which enables us # to handle startup failure gracefully in the OpenAI server. if self.output_handler is None: - self.output_handler = asyncio.create_task( - self._run_output_handler()) + self.output_handler = asyncio.create_task(self.step_async()) q = await self.add_request( request_id, @@ -217,53 +218,40 @@ async def generate( # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - # Note: both Detokenizer and EngineCore handle their - # own request cleanup based on finished. + # Note: OutputProcessor removes from request_states. if out.finished: - del self.rid_to_queue[request_id] yield out break yield out - # If the request is disconnected by the client, the - # generate() task will be canceled. So, we abort the - # request if we end up here. + # If the request is disconnected by the client, the generate() + # task will be canceled. So, we abort the request if we end up here. except asyncio.CancelledError: await self.abort(request_id) raise - def _process_request_outputs(self, request_outputs: List[RequestOutput]): - """Process outputs by putting them into per-request queues.""" - - for request_output in request_outputs: - request_id = request_output.request_id - - # Note: it is possible a request was aborted and removed from - # the state due to client cancellations, so if we encounter a - # request id not in the state, we skip. - if request_id in self.rid_to_queue: - self.rid_to_queue[request_id].put_nowait(request_output) - - async def _run_output_handler(self): - """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + async def step_async(self): + """Busy loop: Pull From EngineCore -> Process -> Push to Queues""" try: while True: - # 1) Pull EngineCoreOutput from the EngineCore. + # 1) Pull EngineCoreOutputs from the EngineCore. engine_core_outputs = await self.engine_core.get_output_async() - # 2) Detokenize based on the output. - processed_outputs = self.output_processor.step(engine_core_outputs.outputs) + # 2) Process EngineCoreOutputs, pushing RequestOutputs into + # asyncio queues for handling by the per-req generate() task. + processed_outputs = self.output_processor.process_outputs( + engine_core_outputs) - # 3) Put the RequestOutputs into the per-request queues. - self._process_request_outputs(processed_outputs.request_outputs) + # 3) Abort any reqs that finished due to stop strings. 
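# A minimal, self-contained sketch of the queue-per-request streaming pattern
# this handler relies on: one asyncio.Queue per request id, a background
# producer standing in for the EngineCore pull, and per-request consumers
# standing in for generate(). All names below are illustrative, not vLLM APIs.
import asyncio

async def fan_out_demo() -> None:
    queues = {"req-0": asyncio.Queue(), "req-1": asyncio.Queue()}

    async def producer() -> None:
        for step in range(3):
            for rid, q in queues.items():
                q.put_nowait(f"{rid}: token {step}")
            await asyncio.sleep(0)
        for q in queues.values():
            q.put_nowait(None)          # sentinel: request finished

    async def consume(rid: str) -> None:
        q = queues[rid]
        while (out := await q.get()) is not None:
            print(out)

    await asyncio.gather(producer(), *(consume(rid) for rid in queues))

# asyncio.run(fan_out_demo())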
+ await self.engine_core.abort_requests_async( + processed_outputs.reqs_to_abort) - # 4) Abort any requests that finished due to stop strings. - await self.engine_core.abort_requests_async(processed_outputs.reqs_to_abort) - - # 5) Log any stats. - await self._log_stats( + # 4) Logging. + # TODO(rob): make into a coroutine and launch it in + # background thread once we add Prometheus. + self._log_stats( scheduler_stats=engine_core_outputs.scheduler_stats, iteration_stats=processed_outputs.iteration_stats, ) @@ -273,19 +261,21 @@ async def _run_output_handler(self): kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: - """Abort RequestId in self, detokenizer, and engine core.""" + """Abort a Request.""" - request_ids = [request_id] - await self.engine_core.abort_requests_async(request_ids) - self.output_processor.abort_requests(request_ids) + # Remove from EngineCore. + await self.engine_core.abort_requests_async([request_id]) - # If a request finishes while we await then the request_id - # will be removed from the tracked queues before we get here. - if request_id in self.rid_to_queue: - del self.rid_to_queue[request_id] + # Remove from AsyncLLM. + # Note: the request can finish during await, so check to make + # sure it is still active in the tracker before we pop. + _ = self.request_states.pop(request_id, None) - async def _log_stats(self, scheduler_stats: SchedulerStats): - """Log stats to the stat loggers.""" + def _log_stats( + self, + scheduler_stats: SchedulerStats, + iteration_stats: IterationStats, + ): if not self.log_stats: return @@ -316,8 +306,7 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: - assert lora_request is None - return self.output_processor.tokenizer + return self.tokenizer_group.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: return False diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 176fa839c8f58..1c680b83da065 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -105,7 +105,8 @@ def add_request(self, request: EngineCoreRequest) -> None: self.engine_core.add_request(request) def abort_requests(self, request_ids: List[str]) -> None: - self.engine_core.abort_requests(request_ids) + if len(request_ids) > 0: + self.engine_core.abort_requests(request_ids) def shutdown(self): self.engine_core.shutdown() @@ -218,7 +219,8 @@ def add_request(self, request: EngineCoreRequest) -> None: self._send_input(EngineCoreRequestType.ADD, request) def abort_requests(self, request_ids: List[str]) -> None: - self._send_input(EngineCoreRequestType.ABORT, request_ids) + if len(request_ids) > 0: + self._send_input(EngineCoreRequestType.ABORT, request_ids) def profile(self, is_start: bool = True) -> None: self._send_input(EngineCoreRequestType.PROFILE, diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py new file mode 100644 index 0000000000000..7f6c8036f7890 --- /dev/null +++ b/vllm/v1/engine/detokenizer.py @@ -0,0 +1,180 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.logger import init_logger +from vllm.sampling_params import RequestOutputKind +from vllm.transformers_utils.detokenizer_utils import ( + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest + + +logger = init_logger(__name__) + +@dataclass +class 
DetokenizerOutput: + output_text: str + token_ids: List[int] + finished: bool + finish_reason: Optional[str] = None + stop_reason: Union[int, str, None] = None + + +@dataclass +class Detokenizer: + + # Generation data + output_text: str + tokens: List[str] + token_ids: List[int] + # Length of the prompt (in token_ids) + prompt_len: int + + # Stop strings + stop: List[str] + include_stop_str_in_output: bool + + # Metadata for incremental detokenization + prefix_offset: int + read_offset: int + + # Parameters for detokenization + skip_special_tokens: bool + spaces_between_special_tokens: bool + output_kind: RequestOutputKind + + # Tokenizer for this request + tokenizer: AnyTokenizer + + # Accounting for stop string buffering + stop_buffer_length: int + _last_output_text_offset: int = 0 + + @property + def output_token_ids(self) -> List[int]: + return self.token_ids[self.prompt_len:] + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + ) -> "Detokenizer": + + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=request.prompt_token_ids, + skip_special_tokens=request.sampling_params.skip_special_tokens, + ) + + stops = request.sampling_params.stop + # Number of chars to hold back when stop strings are to be excluded + # from streamed output. + if stops and not request.sampling_params.include_stop_str_in_output: + stop_buffer_length = max(len(s) for s in stops) - 1 + else: + stop_buffer_length = 0 + + return cls( + output_text="", + tokens=tokens, + # Detokenizer mutates this list, so need a unique copy. + # NOTE(Nick): could we take ownership of it though? + token_ids=request.prompt_token_ids.copy(), + stop=stops, + include_stop_str_in_output=request.sampling_params. + include_stop_str_in_output, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=request.sampling_params.skip_special_tokens, + spaces_between_special_tokens=request.sampling_params. + spaces_between_special_tokens, + output_kind=request.sampling_params.output_kind, + prompt_len=len(request.prompt_token_ids), + tokenizer=tokenizer, + stop_buffer_length=stop_buffer_length, + ) + + def update_from_output( + self, + output: EngineCoreOutput, + ) -> Optional[DetokenizerOutput]: + """ + Update RequestState for the request_id by: + 1) Detokenize the new token ids incrementally. + 2) Update the RequestOutput with the new text. + """ + + new_token_ids = output.new_token_ids + finish_reason = output.finish_reason + stop_reason = output.stop_reason + + # 1) Detokenize the new token ids incrementally. + # TODO(woosuk): This method becomes very inefficient when the number of + # new_token_ids is more than 1. We need to optimize this. + decoded_text = "" + for new_token_id in output.new_token_ids: + self.token_ids.append(new_token_id) + (new_tokens, new_decoded_token_text, prefix_offset, + read_offset) = detokenize_incrementally( + tokenizer=self.tokenizer, + all_input_ids=self.token_ids, + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + + self.tokens.extend(new_tokens) + self.prefix_offset = prefix_offset + self.read_offset = read_offset + self.output_text += new_decoded_token_text + + decoded_text += new_decoded_token_text + + # 2) Evaluate stop criteria. 
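# How the stop-string check interacts with streaming, as a hedged sketch in
# plain Python (no vLLM types): a stop string may arrive split across decode
# steps, so up to len(stop) - 1 trailing characters are held back from the
# stream (the stop_buffer_length computed above) and the text is truncated
# at the stop string once it appears.
stop = "###"
hold_back = len(stop) - 1               # mirrors stop_buffer_length
text, sent = "", 0
for chunk in ["Hello #", "#", "# world"]:
    text += chunk
    idx = text.find(stop)
    if idx != -1:
        print(text[sent:idx])           # flush up to, not including, the stop
        break
    safe = max(len(text) - hold_back, sent)
    print(text[sent:safe], end="")      # stream only what cannot be a stop prefix
    sent = safe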
+ if self.stop: + stop = StopChecker.check_stop_strings( + output_text=self.output_text, + new_char_count=len(decoded_text), + stop=self.stop, + include_in_output=self.include_stop_str_in_output, + ) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + self.output_text = self.output_text[:truncate_to] + finish_reason = "stop" # TODO: use constant + stop_reason = stop_str + + # 3) Update the RequestOutput object with the new text. + finished = bool(finish_reason) + if self.output_kind == RequestOutputKind.FINAL_ONLY \ + and not finished: + return None + + delta = self.output_kind == RequestOutputKind.DELTA + output_text = self._get_next_output_text(finished, delta) + token_ids = new_token_ids if delta else self.output_token_ids + + return DetokenizerOutput( + output_text, token_ids, finished, finish_reason, stop_reason) + + + def _get_next_output_text(self, finished: bool, delta: bool) -> str: + """If delta is True, only new text since the last call to + this method is returned""" + + # We return the full output text if the sequence is finished. + buffer_length = 0 if finished else self.stop_buffer_length + if not delta: + return self.output_text[:-buffer_length] if buffer_length else ( + self.output_text) + length = len(self.output_text) - buffer_length + last_offset = self._last_output_text_offset + if last_offset < length: + self._last_output_text_offset = length + return self.output_text[last_offset:length] + return "" diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 27321752c3efa..5a6f75ea70f2a 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -15,11 +15,12 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import ( - BaseTokenizerGroup, init_tokenizer_from_configs) + AnyTokenizer, BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor +from vllm.v1.engine.request_state import RequestState from vllm.v1.executor.abstract import Executor logger = init_logger(__name__) @@ -45,27 +46,28 @@ def __init__( self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer = init_tokenizer_from_configs( + self.tokenizer_group = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) - self.tokenizer.ping() + self.tokenizer_group.ping() + + # Request States (map of request_id -> RequestState). + self.request_states: Dict[str, RequestState] = {} # Processor (convert Inputs --> EngineCoreRequests) self.processor = Processor(model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, + tokenizer_group=self.tokenizer_group, input_registry=input_registry, mm_registry=mm_registry) - # Detokenizer (converts EngineCoreOutputs --> RequestOutput) + # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). 
self.output_processor = OutputProcessor( - tokenizer_name=vllm_config.model_config.tokenizer, - tokenizer_mode=vllm_config.model_config.tokenizer_mode, - trust_remote_code=vllm_config.model_config.trust_remote_code, - revision=vllm_config.model_config.tokenizer_revision, + request_states=self.request_states, + log_stats=False, ) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) @@ -103,21 +105,15 @@ def from_engine_args( multiprocess_mode=enable_multiprocessing) def get_num_unfinished_requests(self) -> int: - return self.output_processor.get_num_unfinished_requests() + return len(self.request_states) def has_unfinished_requests(self) -> bool: - return self.output_processor.has_unfinished_requests() + return self.get_num_unfinished_requests() > 0 @classmethod def validate_outputs(cls, outputs, output_type): return outputs - def abort_request(self, request_ids: List[str]) -> None: - """Remove request_ids from EngineCore and Detokenizer.""" - - self.engine_core.abort_requests(request_ids) - self.output_processor.abort_requests(request_ids) - def add_request( self, request_id: str, @@ -130,33 +126,39 @@ def add_request( priority: int = 0, ) -> None: - # 1) Process raw inputs into the request. + if request_id in self.request_states: + raise ValueError(f"Request id {request_id} already running.") + + # 1) Convert Input --> Request. request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Add the request to Detokenizer. - self.output_processor.add_request(request) + # 2) Make a new RequestState and queue. + self.request_states[request_id] = RequestState.from_new_request( + tokenizer=self.get_tokenizer(), + request=request, + ) # 3) Add the request to EngineCore. self.engine_core.add_request(request) def step(self) -> List[RequestOutput]: + """Pull From EngineCore -> Process -> Return RequestOutput.""" - # 1) Get EngineCoreOutput from the EngineCore. - outputs = self.engine_core.get_output() + # 1) Pull EngineCoreOutput from the EngineCore. + engine_core_outputs = self.engine_core.get_output() - # 2) Detokenizer the EngineCoreOutput. - request_outputs, requests_to_abort = self.detokenizer.step( - outputs.outputs) + # 2) Process EngineCoreOutputs. + processed_outputs = self.output_processor.process_outputs( + engine_core_outputs) - # 3) Abort requests that finished due to stopping criteria. - if requests_to_abort: - self.abort_request(requests_to_abort) + # 3) Abort any reqs that finished due to stop strings. 
+ self.engine_core.abort_requests(processed_outputs.reqs_to_abort) - return request_outputs + return processed_outputs.request_outputs def get_model_config(self): return self.model_config @@ -171,7 +173,7 @@ def get_tokenizer_group( self, group_type: Type[_G] = BaseTokenizerGroup, ) -> _G: - tokenizer_group = self.tokenizer + tokenizer_group = self.tokenizer_group if tokenizer_group is None: raise ValueError("Unable to get tokenizer because " @@ -182,3 +184,9 @@ def get_tokenizer_group( f"found type: {type(tokenizer_group)}") return tokenizer_group + + async def get_tokenizer( + self, + lora_request: Optional[LoRARequest] = None, + ) -> AnyTokenizer: + return self.get_tokenizer_group().get_lora_tokenizer(lora_request) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index e2fcf29cf4915..755e2790805e2 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -1,286 +1,124 @@ from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, List, Optional -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.sampling_params import RequestOutputKind -from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine import EngineCoreOutputs +from vllm.v1.engine.detokenizer import DetokenizerOutput +from vllm.v1.engine.request_state import RequestState from vllm.v1.metrics.stats import IterationStats -logger = init_logger(__name__) @dataclass class OutputProcessorOutput: - """Output from the OutputProcessor.""" + """Output of the OutputProcessor.step() function.""" request_outputs: List[RequestOutput] - requests_to_abort: List[str] + reqs_to_abort: List[str] iteration_stats: IterationStats -@dataclass -class IncrementalDetokenizer: - - # Generation data - output_text: str - tokens: List[str] - token_ids: List[int] - - # Stop strings - stop: List[str] - include_stop_str_in_output: bool - - # Metadata for incremental detokenization - prefix_offset: int - read_offset: int - - # Parameters for detokenization - skip_special_tokens: bool - spaces_between_special_tokens: bool - output_kind: RequestOutputKind - - # TODO: Probably decouple these - request_id: str - prompt: Optional[str] - prompt_token_ids: List[int] - - # Tokenizer for this request - tokenizer: AnyTokenizer - - # Accounting for stop string buffering - stop_buffer_length: int - _last_output_text_offset: int = 0 - - @property - def output_token_ids(self) -> List[int]: - assert len(self.token_ids) >= len(self.prompt_token_ids) - return self.token_ids[len(self.prompt_token_ids):] - - @classmethod - def from_new_request( - cls, - tokenizer: AnyTokenizer, - request: EngineCoreRequest, - ) -> "IncrementalDetokenizer": - - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=tokenizer, - prompt_ids=request.prompt_token_ids, - skip_special_tokens=request.sampling_params.skip_special_tokens, - ) - stops = request.sampling_params.stop - # Number of chars to hold back when stop strings are to be excluded - # from streamed output. 
- if stops and not request.sampling_params.include_stop_str_in_output: - stop_buffer_length = max(len(s) for s in stops) - 1 - else: - stop_buffer_length = 0 +class OutputProcessor: - return cls( - output_text="", - tokens=tokens, - # Detokenizer mutates this list, so need a unique copy. - # NOTE(Nick): could we take ownership of it though? - token_ids=request.prompt_token_ids.copy(), - stop=stops, - include_stop_str_in_output=request.sampling_params. - include_stop_str_in_output, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=request.sampling_params.skip_special_tokens, - spaces_between_special_tokens=request.sampling_params. - spaces_between_special_tokens, - output_kind=request.sampling_params.output_kind, - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, - tokenizer=tokenizer, - stop_buffer_length=stop_buffer_length, - ) + def __init__( + self, + request_states: Dict[str, RequestState], + log_stats: bool, + ): + self.request_states = request_states + self.log_stats = log_stats - def add_tokens( + def make_request_output( self, - new_token_ids: List[int], - finish_reason: Optional[str], - stop_reason: Optional[Union[int, str, None]], + request_state: RequestState, + detokenizer_output: Optional[DetokenizerOutput], ) -> Optional[RequestOutput]: - """ - Update RequestState for the request_id by: - 1) Detokenize the new token ids incrementally. - 2) Update the RequestOutput with the new text. - """ - - # 1) Detokenize the new token ids incrementally. - # TODO(woosuk): This method becomes very inefficient when the number of - # new_token_ids is more than 1. We need to optimize this. - decoded_text = "" - for new_token_id in new_token_ids: - self.token_ids.append(new_token_id) - (new_tokens, new_decoded_token_text, prefix_offset, - read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=self.token_ids, - prev_tokens=self.tokens, - prefix_offset=self.prefix_offset, - read_offset=self.read_offset, - skip_special_tokens=self.skip_special_tokens, - spaces_between_special_tokens=self. - spaces_between_special_tokens, - ) - - self.tokens.extend(new_tokens) - self.prefix_offset = prefix_offset - self.read_offset = read_offset - self.output_text += new_decoded_token_text - - decoded_text += new_decoded_token_text - - # 2) Evaluate stop criteria. - if self.stop: - stop = StopChecker.check_stop_strings( - output_text=self.output_text, - new_char_count=len(decoded_text), - stop=self.stop, - include_in_output=self.include_stop_str_in_output, - ) - if stop is not None: - stop_str, truncate_to = stop - if truncate_to != -1: - self.output_text = self.output_text[:truncate_to] - finish_reason = "stop" # TODO: use constant - stop_reason = stop_str - - # TODO: handle stop_token_ids here too? - # 3) Update the RequestOutput object with the new text. 
- finished = bool(finish_reason) - if self.output_kind == RequestOutputKind.FINAL_ONLY \ - and not finished: + if detokenizer_output is None: return None - delta = self.output_kind == RequestOutputKind.DELTA - output_text = self._get_next_output_text(finished, delta) - token_ids = new_token_ids if delta else self.output_token_ids - request_output = RequestOutput.new( - self.request_id, - self.prompt, - self.prompt_token_ids, - output_text, - token_ids, - finished, + request_state.request_id, + request_state.prompt, + request_state.prompt_token_ids, + detokenizer_output.output_text, + detokenizer_output.token_ids, + detokenizer_output.finished, ) - - if finished: + if detokenizer_output.finished: completion_output = request_output.outputs[0] - completion_output.finish_reason = finish_reason - completion_output.stop_reason = stop_reason + completion_output.finish_reason = detokenizer_output.finish_reason + completion_output.stop_reason = detokenizer_output.stop_reason return request_output - def _get_next_output_text(self, finished: bool, delta: bool) -> str: - """If delta is True, only new text since the last call to - this method is returned""" - - # We return the full output text if the sequence is finished. - buffer_length = 0 if finished else self.stop_buffer_length - if not delta: - return self.output_text[:-buffer_length] if buffer_length else ( - self.output_text) - length = len(self.output_text) - buffer_length - last_offset = self._last_output_text_offset - if last_offset < length: - self._last_output_text_offset = length - return self.output_text[last_offset:length] - return "" - - -class OutputProcessor: - - def __init__(self, - tokenizer_name: str, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - revision: Optional[str] = None): - # TODO: once we support LoRA, we should should pass the tokenizer - # here. We currently have two copies (this + in the LLMEngine). - self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, - tokenizer_mode=tokenizer_mode, - trust_remote_code=trust_remote_code, - revision=revision) - - # Request id -> IncrementalDetokenizer - self.request_states: Dict[str, IncrementalDetokenizer] = {} - - def is_request_active(self, request_id: str): - return request_id in self.request_states - - def get_num_unfinished_requests(self): - return len(self.request_states) - - def has_unfinished_requests(self) -> bool: - return len(self.request_states) > 0 - - def abort_requests( - self, - request_ids: Iterable[str], - ) -> None: - """Remove the request_ids from the Detokenizer.""" - - for request_id in request_ids: - self.request_states.pop(request_id, None) - - def add_request( - self, - request: EngineCoreRequest, - ): - """Add new request to the Detokenizer.""" - - assert (request.request_id not in self.request_states) - - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) - self.request_states[request.request_id] = request_state - - def step( - self, encore_core_outputs: List[EngineCoreOutput] - ) -> OutputProcessorOutput: - """Update state and request the RequestOutputs to the LLMEngine.""" + def process_outputs(self, + outputs: EngineCoreOutputs) -> OutputProcessorOutput: + """ + Process the EngineCoreOutputs: + 1) Compute stats for logging + 2) Detokenize + 3) Create and handle RequestOutput objects: + * If self.stream_outputs (for usage with AsyncLLM), + we put RequestOutput objects into the asyncio queue + for handling by the per-request generate() tasks. 
+ * If not self.stream_outputs (for usage with LLMEngine), + we return a list of RequestOutput objects. + + ****************** NOTE FOR DEVELOPERS ****************** + + VLLM V1 minimizes the number of python loops over the full + batch to ensure system overheads are minimized. This is the + only function that should loop over EngineCoreOutputs. + + If you need to touch every element of the batch, implement a + method called XXXClass.update_from_output() to be called + within the loop below. For examples, see: + * IterationStats.update_from_output() + * Detokenizer.update_from_output() + + ********************************************************** + """ - iteraton_stats = IterationStats() request_outputs: List[RequestOutput] = [] - requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs: - request_id = engine_core_output.request_id - detokenizer = self.request_states.get(request_id) - if detokenizer is None: + reqs_to_abort: List[str] = [] + iteration_stats = IterationStats(self.log_stats) + for engine_core_output in outputs.outputs: + req_id = engine_core_output.request_id + req_state = self.request_states.get(req_id) + if req_state is None: # Ignore output for already-aborted request. continue - # Detokenize and update state. - request_output = detokenizer.add_tokens( - new_token_ids=engine_core_output.new_token_ids, - finish_reason=engine_core_output.finish_reason, - stop_reason=engine_core_output.stop_reason, - ) - - if request_output is not None: - # Add to RequestOutputs list. - request_outputs.append(request_output) + # 1) Compute stats for this iteration. + iteration_stats.update_from_output(engine_core_output, + req_state.is_prefilling, + req_state.prompt_len) + req_state.is_prefilling = False + + # 2) Detokenize the token ids into text. + detokenizer_output = req_state.detokenizer.update_from_output( + engine_core_output) + + # 3) Create and handle RequestOutput objects. + if request_output := self.make_request_output( + req_state, detokenizer_output): + if req_state.queue is not None: + # AsyncLLM: put into queue for handling by generate(). + req_state.queue.put_nowait(request_output) + else: + # LLMEngine: return list of RequestOutputs. + request_outputs.append(request_output) # Free completed requests. if request_output.finished: - self.request_states.pop(request_id) + self.request_states.pop(req_id) if not engine_core_output.finished: - requests_to_abort.append(request_id) + # If req not finished in EngineCore, but Detokenizer + # detected stop string, abort needed in EngineCore. + reqs_to_abort.append(req_id) - # Return to EngineClient. 
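# The NOTE above asks per-feature bookkeeping to live in an
# XXXClass.update_from_output() method so that this remains the only
# per-output Python loop. A hedged, stand-alone sketch of that convention
# with toy types (Output and TokenCounter are illustrative, not vLLM classes):
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class Output:
    request_id: str
    new_token_ids: List[int]

@dataclass
class TokenCounter:
    counts: Dict[str, int] = field(default_factory=dict)

    def update_from_output(self, output: Output) -> None:
        self.counts[output.request_id] = (
            self.counts.get(output.request_id, 0) + len(output.new_token_ids))

def process(outputs: List[Output], counter: TokenCounter) -> None:
    for out in outputs:                  # the single loop over the batch
        counter.update_from_output(out)

counter = TokenCounter()
process([Output("a", [1, 2]), Output("b", [3])], counter)
assert counter.counts == {"a": 2, "b": 1}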
return OutputProcessorOutput( request_outputs=request_outputs, - requests_to_abort=requests_to_abort, - iteration_stats=iteraton_stats, + reqs_to_abort=reqs_to_abort, + iteration_stats=iteration_stats, ) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 43419d2ff5381..cc5980a6b96b5 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -25,19 +25,18 @@ def __init__( model_config: ModelConfig, cache_config: CacheConfig, lora_config: Optional[LoRAConfig], - tokenizer: BaseTokenizerGroup, + tokenizer_group: BaseTokenizerGroup, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): self.model_config = model_config self.lora_config = lora_config - self.tokenizer = tokenizer self.generation_config_fields = model_config.try_get_generation_config( ) self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer, + tokenizer_group, mm_registry) self.input_processor = input_registry.create_input_processor( model_config) diff --git a/vllm/v1/engine/request_state.py b/vllm/v1/engine/request_state.py new file mode 100644 index 0000000000000..ed808ac456624 --- /dev/null +++ b/vllm/v1/engine/request_state.py @@ -0,0 +1,46 @@ +from typing import List, Optional + +import asyncio + +from vllm.outputs import RequestOutput +from vllm.transformers_utils.detokenizer_utils import AnyTokenizer +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.detokenizer import Detokenizer + + +class RequestState: + """RequestState for AsyncLLM and LLMEngine.""" + + def __init__( + self, + request_id: str, + prompt: Optional[str], + prompt_token_ids: List[int], + detokenizer: Detokenizer, + queue: Optional[asyncio.Queue[RequestOutput]], + ): + self.request_id = request_id + self.prompt = prompt + self.prompt_token_ids = prompt_token_ids + self.prompt_len = len(prompt_token_ids) + self.detokenizer = detokenizer + self.is_prefilling = True + self.queue = queue + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + queue: Optional[asyncio.Queue[RequestOutput]] = None, + ) -> "RequestState": + return cls( + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + detokenizer=Detokenizer.from_new_request( + tokenizer=tokenizer, + request=request, + ), + queue=queue, + ) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index d9c7aa400005a..383aa200944eb 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,5 +1,7 @@ from dataclasses import dataclass +from vllm.v1.engine import EngineCoreOutput + @dataclass class SchedulerStats: @@ -12,9 +14,23 @@ class SchedulerStats: # gpu_prefix_cache_hit_rate: float = 0.0 -@dataclass class IterationStats: - """Stats associated with a single iteration""" + """Stats associated with a single set of EngineCoreOutputs.""" + + def __init__(self, log_stats: bool): + self.log_stats = log_stats + self.num_generation_tokens = 0 + self.num_prompt_tokens = 0 + + def update_from_output(self, + output: EngineCoreOutput, + is_prefilling: bool, + prompt_len: int = 0): + """Update the IterationStats with the EngineCoreOutput.""" + + if not self.log_stats: + return - num_generation_tokens: int = 0 - num_prompt_tokens: int = 0 + self.num_generation_tokens += len(output.new_token_ids) + if is_prefilling: + self.num_prompt_tokens += prompt_len From 86d33a16717f3f2261e1d947ed33f1e5568c8692 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 
2025 18:58:11 +0000 Subject: [PATCH 13/67] stash --- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/metrics/stats.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d7068d1262843..5cf8b3a62eb91 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -151,7 +151,7 @@ async def add_request( # 2) Make a nnew RequestState and queue. queue: asyncio.Queue[RequestOutput] = asyncio.Queue() self.request_states[request_id] = RequestState.from_new_request( - tokenizer=self.get_tokenizer(), + tokenizer=(await self.get_tokenizer()), request=request, queue=queue, ) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 383aa200944eb..5deb50f4795bb 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,6 +1,8 @@ +from typing import TYPE_CHECKING from dataclasses import dataclass -from vllm.v1.engine import EngineCoreOutput +if TYPE_CHECKING: + from vllm.v1.engine import EngineCoreOutput @dataclass @@ -23,7 +25,7 @@ def __init__(self, log_stats: bool): self.num_prompt_tokens = 0 def update_from_output(self, - output: EngineCoreOutput, + output: "EngineCoreOutput", is_prefilling: bool, prompt_len: int = 0): """Update the IterationStats with the EngineCoreOutput.""" From 4066fc84a4c751d6d7ff4a2455399e12fb3fd53a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 19:07:45 +0000 Subject: [PATCH 14/67] working again --- tests/v1/engine/test_engine_core.py | 4 ++-- tests/v1/engine/test_engine_core_client.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 5b1732036e807..cccfd305ac604 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -80,7 +80,7 @@ def test_engine_core(monkeypatch): assert len(engine_core.scheduler.running) == 4 # Loop through until they are all done. - while len(engine_core.step()) > 0: + while len(engine_core.step().outputs) > 0: pass assert len(engine_core.scheduler.waiting) == 0 @@ -170,7 +170,7 @@ def test_engine_core_advanced_sampling(monkeypatch): assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 # Loop through until they are all done. 
- while len(engine_core.step()) > 0: + while len(engine_core.step().outputs) > 0: pass assert len(engine_core.scheduler.waiting) == 0 diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 7eac16f2cf542..e2c728b22d481 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -43,7 +43,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: def loop_until_done(client: EngineCoreClient, outputs: Dict): while True: - engine_core_outputs = client.get_output() + engine_core_outputs = client.get_output().outputs if len(engine_core_outputs) == 0: break @@ -61,7 +61,7 @@ def loop_until_done(client: EngineCoreClient, outputs: Dict): async def loop_until_done_async(client: EngineCoreClient, outputs: Dict): while True: - engine_core_outputs = await client.get_output_async() + engine_core_outputs = await client.get_output_async().outputs if len(engine_core_outputs) == 0: break From c9ffc60d2e051eecc13f0ab639ee64cddc316991 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 19:15:37 +0000 Subject: [PATCH 15/67] fixed sorting --- vllm/v1/engine/detokenizer.py | 1 - vllm/v1/engine/request_state.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 7f6c8036f7890..c93795974e347 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -8,7 +8,6 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest - logger = init_logger(__name__) @dataclass diff --git a/vllm/v1/engine/request_state.py b/vllm/v1/engine/request_state.py index ed808ac456624..572b6d977f350 100644 --- a/vllm/v1/engine/request_state.py +++ b/vllm/v1/engine/request_state.py @@ -1,6 +1,5 @@ -from typing import List, Optional - import asyncio +from typing import List, Optional from vllm.outputs import RequestOutput from vllm.transformers_utils.detokenizer_utils import AnyTokenizer From e34b9dc2e0910472f3c46153efd8b81104d0c879 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:14:08 +0000 Subject: [PATCH 16/67] merged --- tests/v1/engine/test_async_llm.py | 61 +++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index fffb5b8100ec7..a9fa8152ed87d 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -13,21 +13,38 @@ allow_module_level=True) ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", + enforce_eager=True, disable_log_requests=True) -async def generate(engine: AsyncLLM, request_id: str, - max_tokens: int) -> Tuple[int, str]: - count = 0 - async for _ in engine.generate(request_id=request_id, - prompt="Hello my name is Robert and", - sampling_params=SamplingParams( - max_tokens=max_tokens, temperature=0)): - - count += 1 - await asyncio.sleep(0.) 
+async def run_example( + engine: AsyncLLM, + request_id: str, + num_tokens: int, + abort_after: int = 0 +) -> Tuple[int, int, str]: + + generator = engine.generate( + request_id=request_id, + prompt="Hello my name is Robert and", + sampling_params=SamplingParams(max_tokens=num_tokens, temperature=0)) - return count, request_id + count = 0 + try: + async for _ in generator(): + count += 1 + print(f"{request_id=}, {count=}, {abort_after=}") + if count == abort_after: + # Simulate request cancellation. + print(f"{request_id=}") + asyncio.current_task().cancel() + except asyncio.CancelledError: + print(f"{request_id=}") + assert request_id not in engine.request_states + finally: + + expected_count = num_tokens if abort_after == 0 else abort_after + return count, expected_count, request_id @pytest.mark.asyncio @@ -40,24 +57,30 @@ async def test_load(monkeypatch): engine = AsyncLLM.from_engine_args(ENGINE_ARGS) - NUM_REQUESTS = 10000 + NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 10 + # Abort 1/100 requests after 5 tokens. + ABORT_RATE = 100 + ABORT_AFTER = 5 request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] # Create concurrent requests. - tasks = [] - for request_id in request_ids: - tasks.append( - asyncio.create_task( - generate(engine, request_id, NUM_EXPECTED_TOKENS))) + tasks = [ + asyncio.create_task(run_example( + engine=engine, + request_id=request_id, + num_tokens=NUM_EXPECTED_TOKENS, + abort_after=(ABORT_AFTER if idx % ABORT_RATE == 0 else 0) + )) for idx, request_id in enumerate(request_ids) + ] # Confirm that we got all the EXPECTED tokens from the requests. failed_request_id = None tokens = None for task in tasks: - num_generated_tokens, request_id = await task - if (num_generated_tokens != NUM_EXPECTED_TOKENS + num_generated_tokens, expected_tokens, request_id = await task + if (num_generated_tokens != expected_tokens and failed_request_id is None): failed_request_id = request_id tokens = num_generated_tokens From dd6e3d60fed08dc19b547dfd0cf3a1574ceb6b84 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:18:07 +0000 Subject: [PATCH 17/67] reduce number of changes --- vllm/v1/engine/processor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index cc5980a6b96b5..43419d2ff5381 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -25,18 +25,19 @@ def __init__( model_config: ModelConfig, cache_config: CacheConfig, lora_config: Optional[LoRAConfig], - tokenizer_group: BaseTokenizerGroup, + tokenizer: BaseTokenizerGroup, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): self.model_config = model_config self.lora_config = lora_config + self.tokenizer = tokenizer self.generation_config_fields = model_config.try_get_generation_config( ) self.input_preprocessor = InputPreprocessor(model_config, - tokenizer_group, + self.tokenizer, mm_registry) self.input_processor = input_registry.create_input_processor( model_config) From dbd86b8591b4ea7d468c5b6ebe5f2383d9f33f38 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:18:42 +0000 Subject: [PATCH 18/67] reduce changes --- vllm/v1/metrics/loggers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 4ed9193128647..6b2fd02b8aed2 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -17,6 +17,7 @@ def log(self, scheduler_stats: 
SchedulerStats): class LoggingStatLogger(StatLoggerBase): + """Log Stats to standard output.""" def __init__(self): self.last_log_time = time.monotonic() From ebf3530c82736d5d6f570614a9345bf78ddd0549 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:19:31 +0000 Subject: [PATCH 19/67] reduce changes --- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/llm_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a78c67a70bbc9..da0bd252b5263 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -72,7 +72,7 @@ def __init__( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer_group=self.tokenizer_group, + tokenizer=self.tokenizer_group, input_registry=input_registry, ) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 5a6f75ea70f2a..24094b43a984d 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -60,7 +60,7 @@ def __init__( self.processor = Processor(model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer_group=self.tokenizer_group, + tokenizer=self.tokenizer_group, input_registry=input_registry, mm_registry=mm_registry) From 7b6d9b3cbe3987899edfaa6f1de02564d1e0fc0e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:20:18 +0000 Subject: [PATCH 20/67] updared --- vllm/v1/metrics/loggers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 6b2fd02b8aed2..8feeef17542e6 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -17,12 +17,12 @@ def log(self, scheduler_stats: SchedulerStats): class LoggingStatLogger(StatLoggerBase): - """Log Stats to standard output.""" def __init__(self): self.last_log_time = time.monotonic() def log(self, scheduler_stats: SchedulerStats): + """Log Stats to standard output.""" # Log every _LOCAL_LOGGING_INTERVAL_SEC. now = time.monotonic() From 707796f11c865214855681dca0f2531e86cd8598 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:22:59 +0000 Subject: [PATCH 21/67] make pr more reviewable --- vllm/v1/engine/llm_engine.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 24094b43a984d..4c272b1b483ab 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -46,12 +46,12 @@ def __init__( self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer_group = init_tokenizer_from_configs( + self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) - self.tokenizer_group.ping() + self.tokenizer.ping() # Request States (map of request_id -> RequestState). 
self.request_states: Dict[str, RequestState] = {} @@ -60,7 +60,7 @@ def __init__( self.processor = Processor(model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer_group, + tokenizer=self.tokenizer, input_registry=input_registry, mm_registry=mm_registry) @@ -138,7 +138,7 @@ def add_request( # 2) Make a new RequestState and queue. self.request_states[request_id] = RequestState.from_new_request( - tokenizer=self.get_tokenizer(), + tokenizer=self.get_tokenizer_group().get_lora_tokenizer(lora_request), request=request, ) @@ -173,7 +173,7 @@ def get_tokenizer_group( self, group_type: Type[_G] = BaseTokenizerGroup, ) -> _G: - tokenizer_group = self.tokenizer_group + tokenizer_group = self.tokenizer if tokenizer_group is None: raise ValueError("Unable to get tokenizer because " @@ -184,9 +184,3 @@ def get_tokenizer_group( f"found type: {type(tokenizer_group)}") return tokenizer_group - - async def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return self.get_tokenizer_group().get_lora_tokenizer(lora_request) From df72c8f320618156aa8c716dc40ebbc1a0017368 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:24:51 +0000 Subject: [PATCH 22/67] update comments --- vllm/v1/engine/output_processor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 755e2790805e2..5725a19b8dbc9 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -58,11 +58,12 @@ def process_outputs(self, 1) Compute stats for logging 2) Detokenize 3) Create and handle RequestOutput objects: - * If self.stream_outputs (for usage with AsyncLLM), - we put RequestOutput objects into the asyncio queue - for handling by the per-request generate() tasks. - * If not self.stream_outputs (for usage with LLMEngine), - we return a list of RequestOutput objects. + * If there is a queue (for usage with AsyncLLM), + put the RequestOutput objects into the queue for + handling by the per-request generate() tasks. + + * If there is no queue (for usage with LLMEngine), + return a list of RequestOutput objects. ****************** NOTE FOR DEVELOPERS ****************** From 9d67efc40e45322cef9d9bd8d4a3fd001e689c25 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:27:27 +0000 Subject: [PATCH 23/67] make PR more readable --- vllm/v1/engine/async_llm.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index da0bd252b5263..7ca22f6c6c7a0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -57,12 +57,12 @@ def __init__( self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer_group = init_tokenizer_from_configs( + self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) - self.tokenizer_group.ping() + self.tokenizer.ping() # Request States (map of request_id -> RequestState). 
self.request_states: Dict[str, RequestState] = {} @@ -72,7 +72,7 @@ def __init__( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer_group, + tokenizer=self.tokenizer, input_registry=input_registry, ) @@ -155,7 +155,7 @@ async def add_request( # 2) Make a nnew RequestState and queue. queue: asyncio.Queue[RequestOutput] = asyncio.Queue() self.request_states[request_id] = RequestState.from_new_request( - tokenizer=(await self.get_tokenizer()), + tokenizer=(await self.get_tokenizer(lora_request)), request=request, queue=queue, ) @@ -265,14 +265,13 @@ async def step_async(self): kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: - """Abort a Request.""" + """Abort RequestId in AsyncLLM and EngineCore.""" - # Remove from EngineCore. - await self.engine_core.abort_requests_async([request_id]) + request_ids = [request_id] + await self.engine_core.abort_requests_async(request_ids) - # Remove from AsyncLLM. - # Note: the request can finish during await, so check to make - # sure it is still active in the tracker before we pop. + # If a request finishes while we await then the request_id + # will be removed from the tracked queues before we get here. _ = self.request_states.pop(request_id, None) def _log_stats( @@ -310,7 +309,7 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: - return self.tokenizer_group.get_lora_tokenizer(lora_request) + return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: return False From 1cae7836148bea3c35641675ae3d2dc435167e58 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:27:57 +0000 Subject: [PATCH 24/67] reduce cruft --- vllm/v1/engine/async_llm.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7ca22f6c6c7a0..27663562ea990 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -50,10 +50,6 @@ def __init__( LoggingStatLogger(), # TODO(rob): PrometheusStatLogger(), ] - self.stat_loggers: List[StatLoggerBase] = [ - LoggingStatLogger(), - # TODO(rob): PrometheusStatLogger(), - ] self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). From 6401cfa0dbcb2998c662537f78a07134e6efbbb0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:28:29 +0000 Subject: [PATCH 25/67] reduce changes --- vllm/v1/engine/async_llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 27663562ea990..6de28bc6d3fbf 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -63,7 +63,7 @@ def __init__( # Request States (map of request_id -> RequestState). self.request_states: Dict[str, RequestState] = {} - # Processor (convert Inputs --> EngineCoreRequests). + # Processor (converts Inputs --> EngineCoreRequests). self.processor = Processor( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, @@ -72,7 +72,7 @@ def __init__( input_registry=input_registry, ) - # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). + # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). 
self.output_processor = OutputProcessor( request_states=self.request_states, log_stats=self.log_stats, From 33bc01d02f35265cf80f93c89dc0d6f6d7b01993 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:29:14 +0000 Subject: [PATCH 26/67] reduce changes --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6de28bc6d3fbf..d06455a91df88 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -148,7 +148,7 @@ async def add_request( prompt_adapter_request, priority) - # 2) Make a nnew RequestState and queue. + # 2) Add the request to AsyncLLM. queue: asyncio.Queue[RequestOutput] = asyncio.Queue() self.request_states[request_id] = RequestState.from_new_request( tokenizer=(await self.get_tokenizer(lora_request)), From 7dda30541ee657358eaba52fc40846e610821f87 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:30:00 +0000 Subject: [PATCH 27/67] updated --- vllm/v1/engine/detokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index c93795974e347..728c306228f87 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -26,7 +26,6 @@ class Detokenizer: output_text: str tokens: List[str] token_ids: List[int] - # Length of the prompt (in token_ids) prompt_len: int # Stop strings From 769cff54e1bb1efccc4c3dd83863c8d822054f0c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:30:53 +0000 Subject: [PATCH 28/67] reduce changes --- vllm/v1/engine/detokenizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 728c306228f87..8d9ca1c805ecc 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -111,7 +111,7 @@ def update_from_output( # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. decoded_text = "" - for new_token_id in output.new_token_ids: + for new_token_id in new_token_ids: self.token_ids.append(new_token_id) (new_tokens, new_decoded_token_text, prefix_offset, read_offset) = detokenize_incrementally( @@ -147,6 +147,8 @@ def update_from_output( finish_reason = "stop" # TODO: use constant stop_reason = stop_str + # TODO: handle stop_token_ids here too? + # 3) Update the RequestOutput object with the new text. 
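# What the two output kinds handled below mean, as a hedged sketch with plain
# data (no vLLM types): DELTA streams only the text/tokens produced since the
# previous step, while FINAL_ONLY suppresses intermediate outputs and emits a
# single cumulative result when the request finishes.
steps = [("Hel", False), ("lo", False), (" world", True)]

# DELTA: one output per step, each carrying only the new text.
delta_outputs = [text for text, _ in steps]
assert delta_outputs == ["Hel", "lo", " world"]

# FINAL_ONLY: nothing until finished, then the full accumulated text.
final_outputs = ["".join(t for t, _ in steps[:i + 1])
                 for i, (_, finished) in enumerate(steps) if finished]
assert final_outputs == ["Hello world"]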
finished = bool(finish_reason) if self.output_kind == RequestOutputKind.FINAL_ONLY \ From b1b4c47c250f4975117212e8f91557689cfd3ae7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:35:36 +0000 Subject: [PATCH 29/67] minor cleanups --- vllm/v1/engine/output_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 5725a19b8dbc9..7cedee4d1c317 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -10,7 +10,6 @@ @dataclass class OutputProcessorOutput: - """Output of the OutputProcessor.step() function.""" request_outputs: List[RequestOutput] reqs_to_abort: List[str] From 2f916d14554a6af9ae87ecc0461d376c5809f4fe Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:36:48 +0000 Subject: [PATCH 30/67] clean up --- vllm/v1/metrics/stats.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 5deb50f4795bb..b1b9445d70aec 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -24,10 +24,12 @@ def __init__(self, log_stats: bool): self.num_generation_tokens = 0 self.num_prompt_tokens = 0 - def update_from_output(self, - output: "EngineCoreOutput", - is_prefilling: bool, - prompt_len: int = 0): + def update_from_output( + self, + output: "EngineCoreOutput", + is_prefilling: bool, + prompt_len: int + ): """Update the IterationStats with the EngineCoreOutput.""" if not self.log_stats: From 6a5f245312aa55ef40b0f2873e0406de8820ad77 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:37:32 +0000 Subject: [PATCH 31/67] updated --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d06455a91df88..04b050dda03fb 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -151,7 +151,7 @@ async def add_request( # 2) Add the request to AsyncLLM. queue: asyncio.Queue[RequestOutput] = asyncio.Queue() self.request_states[request_id] = RequestState.from_new_request( - tokenizer=(await self.get_tokenizer(lora_request)), + tokenizer=self.tokenizer.get_lora_tokenizer(lora_request), request=request, queue=queue, ) From 9ea36c89531b53fceec915dd9b2675773dd10d55 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:38:43 +0000 Subject: [PATCH 32/67] updated --- vllm/v1/engine/async_llm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 04b050dda03fb..edafedd0159c2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -199,7 +199,8 @@ async def generate( # we can call __init__ before the event loop, which enables us # to handle startup failure gracefully in the OpenAI server. 
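# The comment above motivates deferring the background output handler until the
# first generate() call, because __init__ may run before any event loop exists.
# Below is a minimal, self-contained asyncio sketch of that lazy-start pattern;
# the class and method names are illustrative only and are not the AsyncLLM API.
import asyncio
from typing import Optional

class LazyHandlerSketch:
    """Illustrative only: defer background-task creation to first use."""

    def __init__(self) -> None:
        # No event loop is needed here, so this can run in sync startup code.
        self._handler_task: Optional[asyncio.Task] = None
        self._queue: Optional[asyncio.Queue] = None

    async def _run_handler(self) -> None:
        assert self._queue is not None
        while True:
            item = await self._queue.get()
            print("handled:", item)

    async def generate(self, item: str) -> None:
        # First call: we are now inside the event loop, so it is safe to
        # create the queue and spawn the long-running handler task.
        if self._handler_task is None:
            self._queue = asyncio.Queue()
            self._handler_task = asyncio.create_task(self._run_handler())
        await self._queue.put(item)

# Example usage: asyncio.run(LazyHandlerSketch().generate("hello"))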
if self.output_handler is None: - self.output_handler = asyncio.create_task(self.step_async()) + self.output_handler = asyncio.create_task( + self._run_output_handler()) q = await self.add_request( request_id, @@ -231,7 +232,7 @@ async def generate( await self.abort(request_id) raise - async def step_async(self): + async def _run_output_handler(self): """Busy loop: Pull From EngineCore -> Process -> Push to Queues""" try: From 318c20399643c1981e4dd7b36ef0701d209fb4dc Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:39:14 +0000 Subject: [PATCH 33/67] reduce changes --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index edafedd0159c2..9c0324d6ff5cc 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -233,7 +233,7 @@ async def generate( raise async def _run_output_handler(self): - """Busy loop: Pull From EngineCore -> Process -> Push to Queues""" + """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" try: while True: From 374618326080c5dbd2a9f9fa4e5ee97e56006890 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:39:36 +0000 Subject: [PATCH 34/67] reduce LOC changes --- vllm/v1/engine/async_llm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9c0324d6ff5cc..d400fe5856a6a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -226,8 +226,9 @@ async def generate( yield out - # If the request is disconnected by the client, the generate() - # task will be canceled. So, we abort the request if we end up here. + # If the request is disconnected by the client, the + # generate() task will be canceled. So, we abort the + # request if we end up here. except asyncio.CancelledError: await self.abort(request_id) raise From 449405b2af39c06679732709429d9f1d3a51f94e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:53:48 +0000 Subject: [PATCH 35/67] updated --- vllm/v1/engine/async_llm.py | 25 ++++------ vllm/v1/engine/output_processor.py | 77 +++++++++++++++++++++++++----- vllm/v1/engine/request_state.py | 45 ----------------- 3 files changed, 76 insertions(+), 71 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d400fe5856a6a..732056ad4342b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -75,6 +75,7 @@ def __init__( # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). self.output_processor = OutputProcessor( request_states=self.request_states, + tokenizer=self.tokenizer, log_stats=self.log_stats, ) @@ -138,25 +139,20 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - if request_id in self.request_states: - raise ValueError(f"Request id {request_id} already running.") + # 1) Create a new output queue for the request. + queue: asyncio.Queue[RequestOutput] = asyncio.Queue() - # 1) Convert Input --> Request. + # 2) Convert Input --> Request. request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Add the request to AsyncLLM. 
- queue: asyncio.Queue[RequestOutput] = asyncio.Queue() - self.request_states[request_id] = RequestState.from_new_request( - tokenizer=self.tokenizer.get_lora_tokenizer(lora_request), - request=request, - queue=queue, - ) + # 3) Add the request to OutputProcessor (this process). + self.output_processor.add_request(request, queue) - # 3) Add the EngineCoreRequest to EngineCore (separate process). + # 4) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(request) if self.log_requests: @@ -219,7 +215,7 @@ async def generate( # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - # Note: OutputProcessor removes from request_states. + # Note: OutputProcessor handles removal from request_states. if out.finished: yield out break @@ -241,10 +237,9 @@ async def _run_output_handler(self): # 1) Pull EngineCoreOutputs from the EngineCore. engine_core_outputs = await self.engine_core.get_output_async() - # 2) Process EngineCoreOutputs, pushing RequestOutputs into - # asyncio queues for handling by the per-req generate() task. + # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( - engine_core_outputs) + engine_core_outputs, self.request_states) # 3) Abort any reqs that finished due to stop strings. await self.engine_core.abort_requests_async( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 7cedee4d1c317..fe398cdf66e8d 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -1,10 +1,12 @@ +import asyncio from dataclasses import dataclass from typing import Dict, List, Optional +from vllm.transformers_utils.detokenizer_utils import AnyTokenizer +from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.outputs import RequestOutput -from vllm.v1.engine import EngineCoreOutputs -from vllm.v1.engine.detokenizer import DetokenizerOutput -from vllm.v1.engine.request_state import RequestState +from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequest +from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerOutput from vllm.v1.metrics.stats import IterationStats @@ -16,15 +18,65 @@ class OutputProcessorOutput: iteration_stats: IterationStats -class OutputProcessor: +class RequestState: def __init__( self, - request_states: Dict[str, RequestState], - log_stats: bool, + request_id: str, + prompt: Optional[str], + prompt_token_ids: List[int], + detokenizer: Detokenizer, + queue: Optional[asyncio.Queue[RequestOutput]], ): - self.request_states = request_states + self.request_id = request_id + self.prompt = prompt + self.prompt_token_ids = prompt_token_ids + self.prompt_len = len(prompt_token_ids) + self.detokenizer = detokenizer + self.is_prefilling = True + self.queue = queue + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + queue: Optional[asyncio.Queue[RequestOutput]] = None, + ) -> "RequestState": + return cls( + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + detokenizer=Detokenizer.from_new_request( + tokenizer=tokenizer, + request=request, + ), + queue=queue, + ) + + +class OutputProcessor: + + def __init__(self, log_stats: bool, tokenizer: BaseTokenizerGroup): self.log_stats = log_stats + self.tokenizer = tokenizer + self.request_states: Dict[str, RequestState] = {} + + def add_request( + self, + request: EngineCoreRequest, + queue: 
Optional[asyncio.Queue[RequestOutput]] = None, + ) -> None: + request_id = request.request_id + if request_id in self.request_states: + raise ValueError(f"Request id {request_id} already running.") + + self.request_states[request_id] = RequestState.from_new_request( + tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request), + request=request, + queue=queue + ) + def make_request_output( self, @@ -50,8 +102,11 @@ def make_request_output( return request_output - def process_outputs(self, - outputs: EngineCoreOutputs) -> OutputProcessorOutput: + def process_outputs( + self, + outputs: EngineCoreOutputs, + request_states: Dict[str, RequestState], + ) -> OutputProcessorOutput: """ Process the EngineCoreOutputs: 1) Compute stats for logging @@ -84,7 +139,7 @@ def process_outputs(self, iteration_stats = IterationStats(self.log_stats) for engine_core_output in outputs.outputs: req_id = engine_core_output.request_id - req_state = self.request_states.get(req_id) + req_state = request_states.get(req_id) if req_state is None: # Ignore output for already-aborted request. continue @@ -111,7 +166,7 @@ def process_outputs(self, # Free completed requests. if request_output.finished: - self.request_states.pop(req_id) + request_states.pop(req_id) if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer # detected stop string, abort needed in EngineCore. diff --git a/vllm/v1/engine/request_state.py b/vllm/v1/engine/request_state.py index 572b6d977f350..e69de29bb2d1d 100644 --- a/vllm/v1/engine/request_state.py +++ b/vllm/v1/engine/request_state.py @@ -1,45 +0,0 @@ -import asyncio -from typing import List, Optional - -from vllm.outputs import RequestOutput -from vllm.transformers_utils.detokenizer_utils import AnyTokenizer -from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.detokenizer import Detokenizer - - -class RequestState: - """RequestState for AsyncLLM and LLMEngine.""" - - def __init__( - self, - request_id: str, - prompt: Optional[str], - prompt_token_ids: List[int], - detokenizer: Detokenizer, - queue: Optional[asyncio.Queue[RequestOutput]], - ): - self.request_id = request_id - self.prompt = prompt - self.prompt_token_ids = prompt_token_ids - self.prompt_len = len(prompt_token_ids) - self.detokenizer = detokenizer - self.is_prefilling = True - self.queue = queue - - @classmethod - def from_new_request( - cls, - tokenizer: AnyTokenizer, - request: EngineCoreRequest, - queue: Optional[asyncio.Queue[RequestOutput]] = None, - ) -> "RequestState": - return cls( - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, - detokenizer=Detokenizer.from_new_request( - tokenizer=tokenizer, - request=request, - ), - queue=queue, - ) From 79f2f5ff2aacfa1573d30ad2e094f07a24b4aa38 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:54:03 +0000 Subject: [PATCH 36/67] remove file --- vllm/v1/engine/request_state.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 vllm/v1/engine/request_state.py diff --git a/vllm/v1/engine/request_state.py b/vllm/v1/engine/request_state.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 From a16d27f1a674ec70320be24f1af39fe3eb235a17 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:55:28 +0000 Subject: [PATCH 37/67] updated --- vllm/v1/engine/async_llm.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 
732056ad4342b..99ab070333854 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -60,9 +60,6 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Request States (map of request_id -> RequestState). - self.request_states: Dict[str, RequestState] = {} - # Processor (converts Inputs --> EngineCoreRequests). self.processor = Processor( model_config=vllm_config.model_config, @@ -74,7 +71,6 @@ def __init__( # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). self.output_processor = OutputProcessor( - request_states=self.request_states, tokenizer=self.tokenizer, log_stats=self.log_stats, ) @@ -215,7 +211,8 @@ async def generate( # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - # Note: OutputProcessor handles removal from request_states. + # Note: both OutputProcessor and EngineCore handle their + # own cleanup based on finished. if out.finished: yield out break From 19372f933770bc22207734bba958e504d7205784 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:55:53 +0000 Subject: [PATCH 38/67] reduce LOC changes --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 99ab070333854..ff305ad85b276 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -212,7 +212,7 @@ async def generate( out = q.get_nowait() if q.qsize() > 0 else await q.get() # Note: both OutputProcessor and EngineCore handle their - # own cleanup based on finished. + # own request cleanup based on finished. if out.finished: yield out break From 39be5038f9d471c045ddec2f0eb3a2b7c272c1fd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:57:17 +0000 Subject: [PATCH 39/67] updated --- vllm/v1/engine/output_processor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index fe398cdf66e8d..bfb7d1230a0f5 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -77,6 +77,13 @@ def add_request( queue=queue ) + def abort_requests( + self, + request_ids: List[str], + ) -> None: + for request_id in request_ids: + self.request_states.pop(request_id, None) + def make_request_output( self, From 833f028517df84b8530ead4ea728603e9915186b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:58:22 +0000 Subject: [PATCH 40/67] updated --- vllm/v1/engine/async_llm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ff305ad85b276..515bb2e9b81d7 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -255,14 +255,12 @@ async def _run_output_handler(self): kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: - """Abort RequestId in AsyncLLM and EngineCore.""" + """Abort RequestId in OutputProcessor and EngineCore.""" request_ids = [request_id] await self.engine_core.abort_requests_async(request_ids) + self.output_processor.abort_requests(request_ids) - # If a request finishes while we await then the request_id - # will be removed from the tracked queues before we get here. 
- _ = self.request_states.pop(request_id, None) def _log_stats( self, From ef2c3f9e18988f444a88dbd85cb65b3d68ce9b56 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:59:41 +0000 Subject: [PATCH 41/67] updated --- vllm/v1/engine/async_llm.py | 9 +++------ vllm/v1/engine/output_processor.py | 5 ++--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 515bb2e9b81d7..c5ea2c66574ee 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -235,19 +235,17 @@ async def _run_output_handler(self): engine_core_outputs = await self.engine_core.get_output_async() # 2) Process EngineCoreOutputs. - processed_outputs = self.output_processor.process_outputs( - engine_core_outputs, self.request_states) + outputs = self.output_processor.process_outputs(engine_core_outputs) # 3) Abort any reqs that finished due to stop strings. - await self.engine_core.abort_requests_async( - processed_outputs.reqs_to_abort) + await self.engine_core.abort_requests_async(outputs.reqs_to_abort) # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once we add Prometheus. self._log_stats( scheduler_stats=engine_core_outputs.scheduler_stats, - iteration_stats=processed_outputs.iteration_stats, + iteration_stats=outputs.iteration_stats, ) except Exception as e: @@ -261,7 +259,6 @@ async def abort(self, request_id: str) -> None: await self.engine_core.abort_requests_async(request_ids) self.output_processor.abort_requests(request_ids) - def _log_stats( self, scheduler_stats: SchedulerStats, diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index bfb7d1230a0f5..16ffd1a937b55 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -112,7 +112,6 @@ def make_request_output( def process_outputs( self, outputs: EngineCoreOutputs, - request_states: Dict[str, RequestState], ) -> OutputProcessorOutput: """ Process the EngineCoreOutputs: @@ -146,7 +145,7 @@ def process_outputs( iteration_stats = IterationStats(self.log_stats) for engine_core_output in outputs.outputs: req_id = engine_core_output.request_id - req_state = request_states.get(req_id) + req_state = self.request_states.get(req_id) if req_state is None: # Ignore output for already-aborted request. continue @@ -173,7 +172,7 @@ def process_outputs( # Free completed requests. if request_output.finished: - request_states.pop(req_id) + self.request_states.pop(req_id) if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer # detected stop string, abort needed in EngineCore. From 33303fc298a028c8ad6551eed3bfe800978f59aa Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:01:57 +0000 Subject: [PATCH 42/67] updated --- vllm/v1/engine/async_llm.py | 11 ++++++----- vllm/v1/engine/output_processor.py | 6 +++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c5ea2c66574ee..88b42e76a844b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -232,20 +232,21 @@ async def _run_output_handler(self): try: while True: # 1) Pull EngineCoreOutputs from the EngineCore. - engine_core_outputs = await self.engine_core.get_output_async() + outputs = await self.engine_core.get_output_async() # 2) Process EngineCoreOutputs. 
- outputs = self.output_processor.process_outputs(engine_core_outputs) + processed_outputs = self.output_processor.process_outputs( + outputs.outputs) # 3) Abort any reqs that finished due to stop strings. - await self.engine_core.abort_requests_async(outputs.reqs_to_abort) + await self.engine_core.abort_requests_async(processed_outputs.reqs_to_abort) # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once we add Prometheus. self._log_stats( - scheduler_stats=engine_core_outputs.scheduler_stats, - iteration_stats=outputs.iteration_stats, + scheduler_stats=outputs.scheduler_stats, + iteration_stats=processed_outputs.iteration_stats, ) except Exception as e: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 16ffd1a937b55..0ea79f412981f 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -5,7 +5,7 @@ from vllm.transformers_utils.detokenizer_utils import AnyTokenizer from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.outputs import RequestOutput -from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequest +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerOutput from vllm.v1.metrics.stats import IterationStats @@ -111,7 +111,7 @@ def make_request_output( def process_outputs( self, - outputs: EngineCoreOutputs, + engine_core_outputs: List[EngineCoreOutput], ) -> OutputProcessorOutput: """ Process the EngineCoreOutputs: @@ -143,7 +143,7 @@ def process_outputs( request_outputs: List[RequestOutput] = [] reqs_to_abort: List[str] = [] iteration_stats = IterationStats(self.log_stats) - for engine_core_output in outputs.outputs: + for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id req_state = self.request_states.get(req_id) if req_state is None: From edae5d2c01b1b396ab6c7f0b8e17db179acb5fc3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:07:06 +0000 Subject: [PATCH 43/67] updated --- tests/v1/engine/test_async_llm.py | 35 +++++++++++++++--------------- vllm/v1/engine/async_llm.py | 10 ++++----- vllm/v1/engine/detokenizer.py | 6 ++--- vllm/v1/engine/llm_engine.py | 15 ++++--------- vllm/v1/engine/output_processor.py | 6 ++--- vllm/v1/metrics/stats.py | 10 ++------- 6 files changed, 32 insertions(+), 50 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index a9fa8152ed87d..ab04731e3bc1b 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -17,17 +17,15 @@ disable_log_requests=True) -async def run_example( - engine: AsyncLLM, - request_id: str, - num_tokens: int, - abort_after: int = 0 -) -> Tuple[int, int, str]: - - generator = engine.generate( - request_id=request_id, - prompt="Hello my name is Robert and", - sampling_params=SamplingParams(max_tokens=num_tokens, temperature=0)) +async def run_example(engine: AsyncLLM, + request_id: str, + num_tokens: int, + abort_after: int = 0) -> Tuple[int, int, str]: + + generator = engine.generate(request_id=request_id, + prompt="Hello my name is Robert and", + sampling_params=SamplingParams( + max_tokens=num_tokens, temperature=0)) count = 0 try: @@ -42,7 +40,7 @@ async def run_example( print(f"{request_id=}") assert request_id not in engine.request_states finally: - + expected_count = num_tokens if abort_after == 0 else abort_after return count, expected_count, request_id @@ -67,12 +65,13 @@ 
async def test_load(monkeypatch): # Create concurrent requests. tasks = [ - asyncio.create_task(run_example( - engine=engine, - request_id=request_id, - num_tokens=NUM_EXPECTED_TOKENS, - abort_after=(ABORT_AFTER if idx % ABORT_RATE == 0 else 0) - )) for idx, request_id in enumerate(request_ids) + asyncio.create_task( + run_example(engine=engine, + request_id=request_id, + num_tokens=NUM_EXPECTED_TOKENS, + abort_after=(ABORT_AFTER if idx % + ABORT_RATE == 0 else 0))) + for idx, request_id in enumerate(request_ids) ] # Confirm that we got all the EXPECTED tokens from the requests. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 88b42e76a844b..39306163ad18f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -20,7 +20,6 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor -from vllm.v1.engine.request_state import RequestState from vllm.v1.executor.abstract import Executor from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase from vllm.v1.metrics.stats import IterationStats, SchedulerStats @@ -70,10 +69,8 @@ def __init__( ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). - self.output_processor = OutputProcessor( - tokenizer=self.tokenizer, - log_stats=self.log_stats, - ) + self.output_processor = OutputProcessor(self.tokenizer, + log_stats=self.log_stats) # EngineCore (starts the engine in background process). self.engine_core = EngineCoreClient.make_client( @@ -239,7 +236,8 @@ async def _run_output_handler(self): outputs.outputs) # 3) Abort any reqs that finished due to stop strings. - await self.engine_core.abort_requests_async(processed_outputs.reqs_to_abort) + await self.engine_core.abort_requests_async( + processed_outputs.reqs_to_abort) # 4) Logging. # TODO(rob): make into a coroutine and launch it in diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8d9ca1c805ecc..2bbbe7cfd89ce 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -10,6 +10,7 @@ logger = init_logger(__name__) + @dataclass class DetokenizerOutput: output_text: str @@ -159,9 +160,8 @@ def update_from_output( output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids - return DetokenizerOutput( - output_text, token_ids, finished, finish_reason, stop_reason) - + return DetokenizerOutput(output_text, token_ids, finished, + finish_reason, stop_reason) def _get_next_output_text(self, finished: bool, delta: bool) -> str: """If delta is True, only new text since the last call to diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4c272b1b483ab..145439067253e 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -53,9 +53,6 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Request States (map of request_id -> RequestState). - self.request_states: Dict[str, RequestState] = {} - # Processor (convert Inputs --> EngineCoreRequests) self.processor = Processor(model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, @@ -65,10 +62,8 @@ def __init__( mm_registry=mm_registry) # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). 
- self.output_processor = OutputProcessor( - request_states=self.request_states, - log_stats=False, - ) + self.output_processor = OutputProcessor(self.tokenizer, + log_stats=False) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( @@ -126,9 +121,6 @@ def add_request( priority: int = 0, ) -> None: - if request_id in self.request_states: - raise ValueError(f"Request id {request_id} already running.") - # 1) Convert Input --> Request. request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, @@ -138,7 +130,8 @@ def add_request( # 2) Make a new RequestState and queue. self.request_states[request_id] = RequestState.from_new_request( - tokenizer=self.get_tokenizer_group().get_lora_tokenizer(lora_request), + tokenizer=self.get_tokenizer_group().get_lora_tokenizer( + lora_request), request=request, ) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 0ea79f412981f..fef45138bf951 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -61,7 +61,7 @@ def __init__(self, log_stats: bool, tokenizer: BaseTokenizerGroup): self.log_stats = log_stats self.tokenizer = tokenizer self.request_states: Dict[str, RequestState] = {} - + def add_request( self, request: EngineCoreRequest, @@ -74,8 +74,7 @@ def add_request( self.request_states[request_id] = RequestState.from_new_request( tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request), request=request, - queue=queue - ) + queue=queue) def abort_requests( self, @@ -83,7 +82,6 @@ def abort_requests( ) -> None: for request_id in request_ids: self.request_states.pop(request_id, None) - def make_request_output( self, diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index b1b9445d70aec..062f419f806bf 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -24,14 +24,8 @@ def __init__(self, log_stats: bool): self.num_generation_tokens = 0 self.num_prompt_tokens = 0 - def update_from_output( - self, - output: "EngineCoreOutput", - is_prefilling: bool, - prompt_len: int - ): - """Update the IterationStats with the EngineCoreOutput.""" - + def update_from_output(self, output: "EngineCoreOutput", + is_prefilling: bool, prompt_len: int): if not self.log_stats: return From a20c7b50c1b6859efdbd18a927698e6526da858b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:08:37 +0000 Subject: [PATCH 44/67] updated --- vllm/v1/engine/llm_engine.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 145439067253e..3a0265e254dcf 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -109,6 +109,12 @@ def has_unfinished_requests(self) -> bool: def validate_outputs(cls, outputs, output_type): return outputs + def abort_request(self, request_ids: List[str]) -> None: + """Remove request_ids from EngineCore and Detokenizer.""" + + self.engine_core.abort_requests(request_ids) + self.output_processor.abort_requests(request_ids) + def add_request( self, request_id: str, @@ -121,7 +127,7 @@ def add_request( priority: int = 0, ) -> None: - # 1) Convert Input --> Request. + # 1) Process raw inputs into the request. request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, @@ -129,11 +135,7 @@ def add_request( priority) # 2) Make a new RequestState and queue. 
- self.request_states[request_id] = RequestState.from_new_request( - tokenizer=self.get_tokenizer_group().get_lora_tokenizer( - lora_request), - request=request, - ) + self.output_processor.add_request(request) # 3) Add the request to EngineCore. self.engine_core.add_request(request) From b7e5a91495aa79a76b7feec453af6d449b91559a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:09:17 +0000 Subject: [PATCH 45/67] updated --- vllm/v1/engine/llm_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 3a0265e254dcf..de2b2599281cb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -109,10 +109,10 @@ def has_unfinished_requests(self) -> bool: def validate_outputs(cls, outputs, output_type): return outputs - def abort_request(self, request_ids: List[str]) -> None: - """Remove request_ids from EngineCore and Detokenizer.""" + def abort_request(self, request_ids: List[str]) -> None: + """Remove request_ids from EngineCore and Detokenizer.""" - self.engine_core.abort_requests(request_ids) + self.engine_core.abort_requests(request_ids) self.output_processor.abort_requests(request_ids) def add_request( From 93530109539327b7f4e4705698b639c80913c5e4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:12:33 +0000 Subject: [PATCH 46/67] fixed --- vllm/v1/engine/llm_engine.py | 6 +-- vllm/v1/engine/output_processor.py | 77 +++++++++++++++++------------- 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index de2b2599281cb..efa249d6c512b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -100,10 +100,10 @@ def from_engine_args( multiprocess_mode=enable_multiprocessing) def get_num_unfinished_requests(self) -> int: - return len(self.request_states) + return self.output_processor.get_num_unfinished_requests() def has_unfinished_requests(self) -> bool: - return self.get_num_unfinished_requests() > 0 + return self.output_processor.has_unfinished_requests() @classmethod def validate_outputs(cls, outputs, output_type): @@ -148,7 +148,7 @@ def step(self) -> List[RequestOutput]: # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( - engine_core_outputs) + engine_core_outputs.output) # 3) Abort any reqs that finished due to stop strings. 
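# The abort in step 3 is needed because stop strings are only detected during
# detokenization in this front-end process, while the core scheduler only sees
# token ids and still believes those requests are running. A self-contained
# sketch of that hand-off follows; all names are illustrative, not the vLLM API.
from typing import Dict, List

def _hit_stop(text: str, stop_strings: List[str]) -> bool:
    return any(s in text for s in stop_strings)

def finished_on_stop(texts: Dict[str, str],
                     stops: Dict[str, List[str]]) -> List[str]:
    # Mirrors reqs_to_abort: ids that stopped in the front end but that the
    # core scheduler must still be told to drop.
    return [rid for rid, text in texts.items() if _hit_stop(text, stops[rid])]

class _FakeCore:
    def __init__(self) -> None:
        self.running = {"req-0", "req-1"}

    def abort_requests(self, request_ids: List[str]) -> None:
        for rid in request_ids:
            self.running.discard(rid)

core = _FakeCore()
core.abort_requests(
    finished_on_stop({"req-0": "a stop word appeared", "req-1": "still going"},
                     {"req-0": ["stop word"], "req-1": ["never"]}))
assert core.running == {"req-1"}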
self.engine_core.abort_requests(processed_outputs.reqs_to_abort) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index fef45138bf951..7d1942554db77 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -56,12 +56,30 @@ def from_new_request( class OutputProcessor: + """Process EngineCoreOutputs into RequestOutputs.""" - def __init__(self, log_stats: bool, tokenizer: BaseTokenizerGroup): + def __init__( + self, + tokenizer: BaseTokenizerGroup, + log_stats: bool, + ): self.log_stats = log_stats self.tokenizer = tokenizer self.request_states: Dict[str, RequestState] = {} + def get_num_unfinished_requests(self): + return len(self.request_states) + + def has_unfinished_requests(self) -> bool: + return len(self.request_states) > 0 + + def abort_requests( + self, + request_ids: List[str], + ) -> None: + for request_id in request_ids: + self.request_states.pop(request_id, None) + def add_request( self, request: EngineCoreRequest, @@ -76,37 +94,6 @@ def add_request( request=request, queue=queue) - def abort_requests( - self, - request_ids: List[str], - ) -> None: - for request_id in request_ids: - self.request_states.pop(request_id, None) - - def make_request_output( - self, - request_state: RequestState, - detokenizer_output: Optional[DetokenizerOutput], - ) -> Optional[RequestOutput]: - - if detokenizer_output is None: - return None - - request_output = RequestOutput.new( - request_state.request_id, - request_state.prompt, - request_state.prompt_token_ids, - detokenizer_output.output_text, - detokenizer_output.token_ids, - detokenizer_output.finished, - ) - if detokenizer_output.finished: - completion_output = request_output.outputs[0] - completion_output.finish_reason = detokenizer_output.finish_reason - completion_output.stop_reason = detokenizer_output.stop_reason - - return request_output - def process_outputs( self, engine_core_outputs: List[EngineCoreOutput], @@ -159,7 +146,7 @@ def process_outputs( engine_core_output) # 3) Create and handle RequestOutput objects. - if request_output := self.make_request_output( + if request_output := self._make_request_output( req_state, detokenizer_output): if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). 
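# A small self-contained sketch of the per-request queue hand-off used above:
# a background handler puts outputs on an asyncio.Queue while the per-request
# consumer drains it, preferring get_nowait() when items are already buffered
# to avoid an extra task switch. Names are illustrative only.
import asyncio

async def producer(q: "asyncio.Queue[str]") -> None:
    for i in range(3):
        await q.put(f"chunk-{i}")
        await asyncio.sleep(0)      # yield so the consumer can run
    await q.put("done")

async def consumer(q: "asyncio.Queue[str]") -> None:
    while True:
        # Skip the await (and the task switch) when an item is already queued.
        out = q.get_nowait() if q.qsize() > 0 else await q.get()
        print(out)
        if out == "done":
            break

async def main() -> None:
    q: "asyncio.Queue[str]" = asyncio.Queue()
    await asyncio.gather(producer(q), consumer(q))

# Example usage: asyncio.run(main())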
@@ -181,3 +168,27 @@ def process_outputs( reqs_to_abort=reqs_to_abort, iteration_stats=iteration_stats, ) + + def _make_request_output( + self, + request_state: RequestState, + detokenizer_output: Optional[DetokenizerOutput], + ) -> Optional[RequestOutput]: + + if detokenizer_output is None: + return None + + request_output = RequestOutput.new( + request_state.request_id, + request_state.prompt, + request_state.prompt_token_ids, + detokenizer_output.output_text, + detokenizer_output.token_ids, + detokenizer_output.finished, + ) + if detokenizer_output.finished: + completion_output = request_output.outputs[0] + completion_output.finish_reason = detokenizer_output.finish_reason + completion_output.stop_reason = detokenizer_output.stop_reason + + return request_output From 94de9f52102e4b78910f805e17b2c348c3c5e878 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:13:39 +0000 Subject: [PATCH 47/67] cleanup --- vllm/v1/engine/llm_engine.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index efa249d6c512b..85f2cef4f34d9 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -141,14 +141,13 @@ def add_request( self.engine_core.add_request(request) def step(self) -> List[RequestOutput]: - """Pull From EngineCore -> Process -> Return RequestOutput.""" - # 1) Pull EngineCoreOutput from the EngineCore. - engine_core_outputs = self.engine_core.get_output() + # 1) Get EngineCoreOutput from the EngineCore. + outputs = self.engine_core.get_output() # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( - engine_core_outputs.output) + outputs.output) # 3) Abort any reqs that finished due to stop strings. self.engine_core.abort_requests(processed_outputs.reqs_to_abort) From 2ea4283f93b43741b097a316409263780c5198ce Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:19:57 +0000 Subject: [PATCH 48/67] revert abort test --- tests/v1/engine/test_async_llm.py | 58 ++++++++++-------------------- vllm/v1/engine/async_llm.py | 4 ++- vllm/v1/engine/detokenizer.py | 4 +-- vllm/v1/engine/llm_engine.py | 5 ++- vllm/v1/engine/output_processor.py | 6 ++-- 5 files changed, 28 insertions(+), 49 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index ab04731e3bc1b..fffb5b8100ec7 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -13,36 +13,21 @@ allow_module_level=True) ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", - enforce_eager=True, disable_log_requests=True) -async def run_example(engine: AsyncLLM, - request_id: str, - num_tokens: int, - abort_after: int = 0) -> Tuple[int, int, str]: +async def generate(engine: AsyncLLM, request_id: str, + max_tokens: int) -> Tuple[int, str]: + count = 0 + async for _ in engine.generate(request_id=request_id, + prompt="Hello my name is Robert and", + sampling_params=SamplingParams( + max_tokens=max_tokens, temperature=0)): - generator = engine.generate(request_id=request_id, - prompt="Hello my name is Robert and", - sampling_params=SamplingParams( - max_tokens=num_tokens, temperature=0)) + count += 1 + await asyncio.sleep(0.) - count = 0 - try: - async for _ in generator(): - count += 1 - print(f"{request_id=}, {count=}, {abort_after=}") - if count == abort_after: - # Simulate request cancellation. 
- print(f"{request_id=}") - asyncio.current_task().cancel() - except asyncio.CancelledError: - print(f"{request_id=}") - assert request_id not in engine.request_states - finally: - - expected_count = num_tokens if abort_after == 0 else abort_after - return count, expected_count, request_id + return count, request_id @pytest.mark.asyncio @@ -55,31 +40,24 @@ async def test_load(monkeypatch): engine = AsyncLLM.from_engine_args(ENGINE_ARGS) - NUM_REQUESTS = 100 + NUM_REQUESTS = 10000 NUM_EXPECTED_TOKENS = 10 - # Abort 1/100 requests after 5 tokens. - ABORT_RATE = 100 - ABORT_AFTER = 5 request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] # Create concurrent requests. - tasks = [ - asyncio.create_task( - run_example(engine=engine, - request_id=request_id, - num_tokens=NUM_EXPECTED_TOKENS, - abort_after=(ABORT_AFTER if idx % - ABORT_RATE == 0 else 0))) - for idx, request_id in enumerate(request_ids) - ] + tasks = [] + for request_id in request_ids: + tasks.append( + asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS))) # Confirm that we got all the EXPECTED tokens from the requests. failed_request_id = None tokens = None for task in tasks: - num_generated_tokens, expected_tokens, request_id = await task - if (num_generated_tokens != expected_tokens + num_generated_tokens, request_id = await task + if (num_generated_tokens != NUM_EXPECTED_TOKENS and failed_request_id is None): failed_request_id = request_id tokens = num_generated_tokens diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 39306163ad18f..f1bdce2c5e474 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,6 +1,6 @@ import asyncio import os -from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union +from typing import AsyncGenerator, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -133,6 +133,8 @@ async def add_request( """Add new request to the AsyncLLM.""" # 1) Create a new output queue for the request. + if request_id in self.output_processor.request_states: + raise ValueError(f"Request id {request_id} already running.") queue: asyncio.Queue[RequestOutput] = asyncio.Queue() # 2) Convert Input --> Request. 
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 2bbbe7cfd89ce..4a8b61beec037 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -21,7 +21,7 @@ class DetokenizerOutput: @dataclass -class Detokenizer: +class IncrementalDetokenizer: # Generation data output_text: str @@ -58,7 +58,7 @@ def from_new_request( cls, tokenizer: AnyTokenizer, request: EngineCoreRequest, - ) -> "Detokenizer": + ) -> "IncrementalDetokenizer": tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( tokenizer=tokenizer, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 85f2cef4f34d9..f5999ccda6447 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -15,12 +15,11 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import ( - AnyTokenizer, BaseTokenizerGroup, init_tokenizer_from_configs) + BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor -from vllm.v1.engine.request_state import RequestState from vllm.v1.executor.abstract import Executor logger = init_logger(__name__) @@ -147,7 +146,7 @@ def step(self) -> List[RequestOutput]: # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( - outputs.output) + outputs.outputs) # 3) Abort any reqs that finished due to stop strings. self.engine_core.abort_requests(processed_outputs.reqs_to_abort) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 7d1942554db77..b89d4862e1ef8 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -6,7 +6,7 @@ from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.outputs import RequestOutput from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerOutput +from vllm.v1.engine.detokenizer import IncrementalDetokenizer, DetokenizerOutput from vllm.v1.metrics.stats import IterationStats @@ -25,7 +25,7 @@ def __init__( request_id: str, prompt: Optional[str], prompt_token_ids: List[int], - detokenizer: Detokenizer, + detokenizer: IncrementalDetokenizer, queue: Optional[asyncio.Queue[RequestOutput]], ): self.request_id = request_id @@ -47,7 +47,7 @@ def from_new_request( request_id=request.request_id, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, - detokenizer=Detokenizer.from_new_request( + detokenizer=IncrementalDetokenizer.from_new_request( tokenizer=tokenizer, request=request, ), From b9683d170efc542fe175eda95955f6bfee438e26 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:31:00 +0000 Subject: [PATCH 49/67] updared --- vllm/v1/engine/async_llm.py | 4 +++- vllm/v1/engine/output_processor.py | 8 ++++++-- vllm/v1/metrics/stats.py | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f1bdce2c5e474..b86fbc3925fe9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -133,7 +133,7 @@ async def add_request( """Add new request to the AsyncLLM.""" # 1) Create a new output queue for the request. 
- if request_id in self.output_processor.request_states: + if self.output_processor.is_request_active(request_id): raise ValueError(f"Request id {request_id} already running.") queue: asyncio.Queue[RequestOutput] = asyncio.Queue() @@ -236,6 +236,8 @@ async def _run_output_handler(self): # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( outputs.outputs) + # NOTE: RequestOutputs are pushed to their queues. + assert len(processed_outputs.request_outputs) == 0 # 3) Abort any reqs that finished due to stop strings. await self.engine_core.abort_requests_async( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index b89d4862e1ef8..2e43f45171f3e 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -2,11 +2,12 @@ from dataclasses import dataclass from typing import Dict, List, Optional +from vllm.outputs import RequestOutput from vllm.transformers_utils.detokenizer_utils import AnyTokenizer from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.outputs import RequestOutput from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import IncrementalDetokenizer, DetokenizerOutput +from vllm.v1.engine.detokenizer import (DetokenizerOutput, + IncrementalDetokenizer) from vllm.v1.metrics.stats import IterationStats @@ -67,6 +68,9 @@ def __init__( self.tokenizer = tokenizer self.request_states: Dict[str, RequestState] = {} + def is_request_active(self, request_id: str) -> bool: + return request_id in self.request_states + def get_num_unfinished_requests(self): return len(self.request_states) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 062f419f806bf..ac94b32fdd5f6 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,5 +1,5 @@ -from typing import TYPE_CHECKING from dataclasses import dataclass +from typing import TYPE_CHECKING if TYPE_CHECKING: from vllm.v1.engine import EngineCoreOutput From 92c3b0c46bb18aa015f983191c3e45fc579a268f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:48:23 +0000 Subject: [PATCH 50/67] stash --- vllm/v1/engine/async_llm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b86fbc3925fe9..659452bbe0ec4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -222,6 +222,7 @@ async def generate( # generate() task will be canceled. So, we abort the # request if we end up here. except asyncio.CancelledError: + print("CANCELED!") await self.abort(request_id) raise @@ -262,6 +263,9 @@ async def abort(self, request_id: str) -> None: await self.engine_core.abort_requests_async(request_ids) self.output_processor.abort_requests(request_ids) + if self.log_requests: + logger.info("Aborted request %s.", request_id) + def _log_stats( self, scheduler_stats: SchedulerStats, From a985a73bbd5aa88639d945a8889ce8b0fb44139e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:00:07 +0000 Subject: [PATCH 51/67] added logging and comment --- vllm/v1/engine/async_llm.py | 1 - vllm/v1/engine/output_processor.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 659452bbe0ec4..a74699f7513e6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -222,7 +222,6 @@ async def generate( # generate() task will be canceled. 
So, we abort the # request if we end up here. except asyncio.CancelledError: - print("CANCELED!") await self.abort(request_id) raise diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 2e43f45171f3e..749f4f5043c97 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -126,6 +126,8 @@ def process_outputs( * IterationStats.update_from_output() * Detokenizer.update_from_output() + TODO(rob): add Protocol makes update_from_output explicit. + ********************************************************** """ From 6c36d87e1049979beb6ca3b4f6e89c15a38ace09 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:08:47 +0000 Subject: [PATCH 52/67] starting to fix tests - stash --- tests/v1/engine/test_detokenizer.py | 218 ---------------------------- 1 file changed, 218 deletions(-) delete mode 100644 tests/v1/engine/test_detokenizer.py diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py deleted file mode 100644 index aeae697ca32b0..0000000000000 --- a/tests/v1/engine/test_detokenizer.py +++ /dev/null @@ -1,218 +0,0 @@ -from typing import List - -import pytest -from transformers import AutoTokenizer - -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import Detokenizer - -TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" -tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) - -FULL_STRINGS = [ - "My name is Robert from Neural Magic and I love working on vLLM so much!", - "Red Hat is the best open source company by far across Linux, K8s, and AI.", - "Nick is the name of my brother in addition to my colleague from Red Hat.", -] - -STOP_STRINGS = ["I love working on", "company by far", "brother in"] - -FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] -PROMPT_LEN = 5 -PROMPT_TOKENS = [ - tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS -] -GENERATION_TOKENS = [ - tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS -] -PROMPT_STRINGS = [ - tokenizer.decode(prompt_tokens, skip_special_tokens=True) - for prompt_tokens in PROMPT_TOKENS -] -PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] -GENERATION_STRINGS = [ - text[prompt_len:] - for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) -] - - -class MockEngineCore: - """Mock outputs form premade tokens lists.""" - - def __init__(self, tokens_list: List[List[int]]): - self.tokens_list = tokens_list - self.current_idx = 0 - - def get_outputs(self) -> List[EngineCoreOutput]: - token_idx = self.current_idx - self.current_idx += 1 - - outputs = [] - for req_idx, token_ids in enumerate(self.tokens_list): - if len(token_ids) > token_idx: - output = EngineCoreOutput(request_id=f"request-{req_idx}", - new_token_ids=[token_ids[token_idx]], - finished=False) - if token_idx == len(token_ids) - 1: - output.finished = True - output.finish_reason = "stopped" - outputs.append(output) - - return outputs - - -@pytest.mark.parametrize( - "request_output_kind", - [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -def test_incremental_detokenization(request_output_kind: RequestOutputKind): - detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore(GENERATION_TOKENS) - - # Make N requests. 
- requests = [ - EngineCoreRequest(request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - eos_token_id=None, - lora_request=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=request_output_kind, - stop=[], - include_stop_str_in_output=False)) - for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) - ] - - # Add requests to the detokenizer. - for request in requests: - detokenizer.add_request(request) - - gen_strings = {} - gen_tokens = {} - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) - assert len(requests_to_abort) == 0 - - # Update tracking. - for request_output in request_outputs: - request_id = request_output.request_id - new_text = request_output.outputs[0].text - new_tokens = request_output.outputs[0].token_ids - if request_id not in gen_strings: - gen_strings[request_id] = new_text - gen_tokens[request_id] = new_tokens - else: - gen_strings[request_id] += new_text - gen_tokens[request_id].extend(new_tokens) - - # Confirmed tracked values matches what we expected. - for idx, (ref_gen_str, ref_gen_toks) in enumerate( - zip(GENERATION_STRINGS, GENERATION_TOKENS)): - gen_str = gen_strings[f"request-{idx}"] - gen_toks = gen_tokens[f"request-{idx}"] - - assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" - assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" - - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() - - -@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -def test_stop_string(include_stop_str_in_output: bool): - detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore(GENERATION_TOKENS) - - # Make N requests. - requests = [ - EngineCoreRequest( - request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - eos_token_id=None, - lora_request=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=STOP_STRINGS, - include_stop_str_in_output=include_stop_str_in_output, - )) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) - ] - - # Add requests to the detokenizer. - for request in requests: - detokenizer.add_request(request) - - gen_strings = {} - aborted = [] - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) - for request_output in request_outputs: - # If aborted, we should not get a request output. - assert request_output.request_id not in aborted - aborted.extend(requests_to_abort) - - # Update tracking. - for request_output in request_outputs: - if request_output.finished: - assert request_output.outputs[0].finish_reason == "stop" - - request_id = request_output.request_id - new_text = request_output.outputs[0].text - if request_id not in gen_strings: - gen_strings[request_id] = new_text - else: - gen_strings[request_id] += new_text - - # Confirmed tracked values matches what we expected. 
- for idx, (ref_gen_str, - stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): - - # Request should be aborted. - request_id = f"request-{idx}" - assert request_id in aborted - - # Collected values that were generated. - gen_str = gen_strings[request_id] - - # Construct reference strings. - stop_str_idx = ref_gen_str.find(stop_str) - ref_str_exc_stop = ref_gen_str[:stop_str_idx] - ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str - - if include_stop_str_in_output: - assert gen_str == ref_str_inc_stop, ( - f"{gen_str=}, {ref_str_inc_stop=}") - else: - assert gen_str == ref_str_exc_stop, ( - f"{gen_str=}, {ref_str_exc_stop=}") - - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() From 595fd122c31032c3aa03d16a86c5332246132461 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:36:06 +0000 Subject: [PATCH 53/67] updated tests --- tests/v1/engine/test_output_processor.py | 222 +++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 tests/v1/engine/test_output_processor.py diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py new file mode 100644 index 0000000000000..c0c335d03b50a --- /dev/null +++ b/tests/v1/engine/test_output_processor.py @@ -0,0 +1,222 @@ +from typing import List + +import pytest +from transformers import AutoTokenizer + +from vllm.config import VllmConfig +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine.output_processor import OutputProcessor + +TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" +tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + +FULL_STRINGS = [ + "My name is Robert from Neural Magic and I love working on vLLM so much!", + "Red Hat is the best open source company by far across Linux, K8s, and AI.", + "Nick is the name of my brother in addition to my colleague from Red Hat.", +] + +STOP_STRINGS = ["I love working on", "company by far", "brother in"] + +FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] +PROMPT_LEN = 5 +PROMPT_TOKENS = [ + tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS +] +GENERATION_TOKENS = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS +] +PROMPT_STRINGS = [ + tokenizer.decode(prompt_tokens, skip_special_tokens=True) + for prompt_tokens in PROMPT_TOKENS +] +PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] +GENERATION_STRINGS = [ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) +] + + +class MockEngineCore: + """Mock outputs form premade tokens lists.""" + + def __init__(self, tokens_list: List[List[int]]): + self.tokens_list = tokens_list + self.current_idx = 0 + + def get_outputs(self) -> List[EngineCoreOutput]: + token_idx = self.current_idx + self.current_idx += 1 + + outputs = [] + for req_idx, token_ids in enumerate(self.tokens_list): + if len(token_ids) > token_idx: + output = EngineCoreOutput(request_id=f"request-{req_idx}", + new_token_ids=[token_ids[token_idx]], + finished=False) + if token_idx == len(token_ids) - 1: + output.finished = True + output.finish_reason = "stopped" + outputs.append(output) + + return outputs + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +def 
test_incremental_detokenization(request_output_kind: RequestOutputKind): + output_processor = OutputProcessor(TOKENIZER_NAME, log_stats=False) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + EngineCoreRequest(request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False)) + for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + output_processor.add_request(request) + + gen_strings = {} + gen_tokens = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + processed_outputs = output_processor.process_outputs(outputs,) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort + assert len(requests_to_abort) == 0 + + # Update tracking. + for request_output in request_outputs: + request_id = request_output.request_id + new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + if request_id not in gen_strings: + gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + else: + gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, ref_gen_toks) in enumerate( + zip(GENERATION_STRINGS, GENERATION_TOKENS)): + gen_str = gen_strings[f"request-{idx}"] + gen_toks = gen_tokens[f"request-{idx}"] + + assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" + assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" + + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) +def test_stop_string(include_stop_str_in_output: bool): + detokenizer = OutputProcessor(TOKENIZER_NAME, log_stats=False) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + EngineCoreRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + )) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + aborted = [] + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + request_outputs, requests_to_abort = detokenizer.step(outputs) + for request_output in request_outputs: + # If aborted, we should not get a request output. + assert request_output.request_id not in aborted + aborted.extend(requests_to_abort) + + # Update tracking. 
+ for request_output in request_outputs: + if request_output.finished: + assert request_output.outputs[0].finish_reason == "stop" + + request_id = request_output.request_id + new_text = request_output.outputs[0].text + if request_id not in gen_strings: + gen_strings[request_id] = new_text + else: + gen_strings[request_id] += new_text + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, + stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): + + # Request should be aborted. + request_id = f"request-{idx}" + assert request_id in aborted + + # Collected values that were generated. + gen_str = gen_strings[request_id] + + # Construct reference strings. + stop_str_idx = ref_gen_str.find(stop_str) + ref_str_exc_stop = ref_gen_str[:stop_str_idx] + ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str + + if include_stop_str_in_output: + assert gen_str == ref_str_inc_stop, ( + f"{gen_str=}, {ref_str_inc_stop=}") + else: + assert gen_str == ref_str_exc_stop, ( + f"{gen_str=}, {ref_str_exc_stop=}") + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() From 5ecfe8e03410b3717b2fcd2a610277f2559b1a22 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:41:28 +0000 Subject: [PATCH 54/67] make tests pass --- tests/v1/engine/test_output_processor.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index c0c335d03b50a..3fcd796a5ba4e 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -3,15 +3,20 @@ import pytest from transformers import AutoTokenizer -from vllm.config import VllmConfig +from vllm.engine.arg_utils import EngineArgs from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine.output_processor import OutputProcessor TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" +VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config() +TOKENIZER_GROUP = init_tokenizer_from_configs( + VLLM_CONFIG.model_config, VLLM_CONFIG.scheduler_config, + VLLM_CONFIG.parallel_config, VLLM_CONFIG.lora_config) tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + FULL_STRINGS = [ "My name is Robert from Neural Magic and I love working on vLLM so much!", "Red Hat is the best open source company by far across Linux, K8s, and AI.", @@ -68,7 +73,7 @@ def get_outputs(self) -> List[EngineCoreOutput]: "request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) def test_incremental_detokenization(request_output_kind: RequestOutputKind): - output_processor = OutputProcessor(TOKENIZER_NAME, log_stats=False) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. @@ -138,7 +143,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) def test_stop_string(include_stop_str_in_output: bool): - detokenizer = OutputProcessor(TOKENIZER_NAME, log_stats=False) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. 
@@ -166,7 +171,7 @@ def test_stop_string(include_stop_str_in_output: bool): # Add requests to the detokenizer. for request in requests: - detokenizer.add_request(request) + output_processor.add_request(request) gen_strings = {} aborted = [] @@ -177,7 +182,9 @@ def test_stop_string(include_stop_str_in_output: bool): break # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort for request_output in request_outputs: # If aborted, we should not get a request output. assert request_output.request_id not in aborted @@ -218,5 +225,5 @@ def test_stop_string(include_stop_str_in_output: bool): assert gen_str == ref_str_exc_stop, ( f"{gen_str=}, {ref_str_exc_stop=}") - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() From 5f37918273d27f84fdd5ce52ff3a382795234968 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:42:00 +0000 Subject: [PATCH 55/67] reduce LOC changes --- tests/v1/engine/test_output_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 3fcd796a5ba4e..5d82682fa1e18 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -16,7 +16,6 @@ VLLM_CONFIG.parallel_config, VLLM_CONFIG.lora_config) tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) - FULL_STRINGS = [ "My name is Robert from Neural Magic and I love working on vLLM so much!", "Red Hat is the best open source company by far across Linux, K8s, and AI.", From 1d9b2337894b6724167e80cb90db073b751bd956 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:50:57 +0000 Subject: [PATCH 56/67] updated --- tests/v1/engine/test_output_processor.py | 31 ++++++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 5d82682fa1e18..8d92bc4508029 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -142,7 +142,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) def test_stop_string(include_stop_str_in_output: bool): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. 
@@ -157,13 +157,7 @@ def test_stop_string(include_stop_str_in_output: bool): mm_placeholders=None, eos_token_id=None, lora_request=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=STOP_STRINGS, - include_stop_str_in_output=include_stop_str_in_output, - )) for idx, ( + sampling_params=SamplingParams()) for idx, ( prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] @@ -226,3 +220,24 @@ def test_stop_string(include_stop_str_in_output: bool): assert output_processor.get_num_unfinished_requests() == 0 assert not output_processor.has_unfinished_requests() + + +def test_iteration_stats(): + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + EngineCoreRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams() + ) for idx, (prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] \ No newline at end of file From 288096214299fc91bddc5a07843a6aa4c701a6af Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 01:26:43 +0000 Subject: [PATCH 57/67] added IterationStats test --- tests/v1/engine/test_output_processor.py | 54 +++++++++++++++++++++--- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 8d92bc4508029..2702886125b30 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -142,7 +142,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) def test_stop_string(include_stop_str_in_output: bool): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. @@ -223,7 +223,7 @@ def test_stop_string(include_stop_str_in_output: bool): def test_iteration_stats(): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. @@ -238,6 +238,50 @@ def test_iteration_stats(): mm_placeholders=None, eos_token_id=None, lora_request=None, - sampling_params=SamplingParams() - ) for idx, (prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) - ] \ No newline at end of file + sampling_params=SamplingParams(), + ) for idx, ( + prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add all requests except one to the OutputProcessor. + num_active = len(GENERATION_TOKENS) - 1 + for request in requests[:num_active]: + output_processor.add_request(request) + inactive_request = requests[num_active] + + # First iteration has 2 prefills. 
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = sum(
+        [len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Add a new requrest - prefill and 2 decodes in this step.
+    output_processor.add_request(inactive_request)
+    num_active += 1
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active

From 7de7c00113e0d89442c9ab18e7004b364de043aa Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" 
Date: Mon, 13 Jan 2025 01:28:22 +0000
Subject: [PATCH 58/67] codespell

---
 tests/v1/engine/test_output_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index 2702886125b30..912bac9513c86 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -267,7 +267,7 @@ def test_iteration_stats():
     assert iteration_stats.num_prompt_tokens == 0
     assert iteration_stats.num_generation_tokens == num_active
 
-    # Add a new requrest - prefill and 2 decodes in this step.
+    # Add a new request - prefill and 2 decodes in this step.
     output_processor.add_request(inactive_request)
     num_active += 1
    outputs = engine_core.get_outputs()[:num_active]

From eec573cd48e9d06973b00a9cfcbe36727269b132 Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" 
Date: Mon, 13 Jan 2025 01:33:53 +0000
Subject: [PATCH 59/67] add comment about invariant

---
 vllm/v1/metrics/stats.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index ac94b32fdd5f6..381ee1dd29251 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -31,4 +31,9 @@ def update_from_output(self, output: "EngineCoreOutput",
         self.num_generation_tokens += len(output.new_token_ids)
         if is_prefilling:
+            # This relies on the invariant that EngineCore does
+            # not stream outputs for partially completed prefills
+            # (scheduler.update_from_output makes EngineCoreOutput
+            # iff num_computed_tokens == num_tokens).
+ assert(output.new_token_ids > 1) self.num_prompt_tokens += prompt_len From 0427e03a5b782da4fb6bc5bfebfc3420b7630e90 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 01:44:18 +0000 Subject: [PATCH 60/67] updated --- tests/v1/engine/test_async_llm.py | 53 +++++++++++++++++++++++++++++++ vllm/v1/metrics/stats.py | 2 +- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index fffb5b8100ec7..7d244c2f8e40e 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -66,4 +66,57 @@ async def test_load(monkeypatch): f"{failed_request_id} generated {tokens} but " f"expected {NUM_EXPECTED_TOKENS}") + # Make sure RequestStates get cleaned up. + assert not engine.output_processor.has_unfinished_requests() engine.shutdown() + + +@pytest.mark.asyncio +async def test_abort(monkeypatch): + # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 + # so that in the future when we switch, we don't have to change all the + # tests. + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine = AsyncLLM.from_engine_args(ENGINE_ARGS) + + NUM_REQUESTS = 100 + NUM_EXPECTED_TOKENS = 100 + REQUEST_IDS_TO_ABORT = [1 + idx * 10 for idx in range(10)] + + request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] + + # Create concurrent requests. + tasks = [] + for request_id in request_ids: + tasks.append( + asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS))) + + # API server cancels requests when they are aborted. + for idx in REQUEST_IDS_TO_ABORT: + tasks[idx].cancel() + + # Confirm the other requests are okay. + failed_request_id = None + tokens = None + for idx, task in enumerate(tasks): + + # Confirm that it was actually canceled. + if idx in REQUEST_IDS_TO_ABORT: + with pytest.raises(asyncio.CancelledError): + await task + + # Otherwise, make sure the request was not impacted. + num_generated_tokens, request_id = await task + if (num_generated_tokens != NUM_EXPECTED_TOKENS + and failed_request_id is None): + failed_request_id = request_id + tokens = num_generated_tokens + + assert failed_request_id is None, ( + f"{failed_request_id} generated {tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") + + assert not engine.output_processor.has_unfinished_requests() \ No newline at end of file diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 381ee1dd29251..bd5fbc922f679 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -35,5 +35,5 @@ def update_from_output(self, output: "EngineCoreOutput", # not stream outputs for partially completed prefills # (scheduler.update_from_output makes EngineCoreOutput # iff num_computed_tokens == num_tokens). 
- assert(output.new_token_ids > 1) + assert(len(output.new_token_ids) > 1) self.num_prompt_tokens += prompt_len From 9b49133885b6760ce056a65e937fe2bad2dcf24d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 01:46:56 +0000 Subject: [PATCH 61/67] tweak --- tests/v1/engine/test_async_llm.py | 12 ++++++++++-- vllm/v1/metrics/stats.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 7d244c2f8e40e..ef2f947d4a963 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -66,7 +66,6 @@ async def test_load(monkeypatch): f"{failed_request_id} generated {tokens} but " f"expected {NUM_EXPECTED_TOKENS}") - # Make sure RequestStates get cleaned up. assert not engine.output_processor.has_unfinished_requests() engine.shutdown() @@ -119,4 +118,13 @@ async def test_abort(monkeypatch): f"{failed_request_id} generated {tokens} but " f"expected {NUM_EXPECTED_TOKENS}") - assert not engine.output_processor.has_unfinished_requests() \ No newline at end of file + assert not engine.output_processor.has_unfinished_requests() + + # Confirm we can do another generation. + task = asyncio.create_task(generate( + engine, REQUEST_IDS_TO_ABORT[0], NUM_EXPECTED_TOKENS)) + num_generated_tokens, request_id = await task + assert num_generated_tokens == NUM_EXPECTED_TOKENS + assert not engine.output_processor.has_unfinished_requests() + + engine.shutdown() diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index bd5fbc922f679..ddd220c4b9be6 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -35,5 +35,5 @@ def update_from_output(self, output: "EngineCoreOutput", # not stream outputs for partially completed prefills # (scheduler.update_from_output makes EngineCoreOutput # iff num_computed_tokens == num_tokens). - assert(len(output.new_token_ids) > 1) + assert(len(output.new_token_ids) > 0) self.num_prompt_tokens += prompt_len From bffa5d011fd5474fab0afb8772bcd3f66e838737 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 02:06:26 +0000 Subject: [PATCH 62/67] formatting and added test --- tests/v1/engine/test_async_llm.py | 52 +++++++++++-------------------- vllm/v1/metrics/stats.py | 2 +- 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index ef2f947d4a963..6764ee799abb0 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,5 +1,5 @@ import asyncio -from typing import Tuple +from typing import List, Tuple import pytest @@ -13,6 +13,7 @@ allow_module_level=True) ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", + enforce_eager=True, disable_log_requests=True) @@ -53,18 +54,11 @@ async def test_load(monkeypatch): generate(engine, request_id, NUM_EXPECTED_TOKENS))) # Confirm that we got all the EXPECTED tokens from the requests. 
- failed_request_id = None - tokens = None for task in tasks: num_generated_tokens, request_id = await task - if (num_generated_tokens != NUM_EXPECTED_TOKENS - and failed_request_id is None): - failed_request_id = request_id - tokens = num_generated_tokens - - assert failed_request_id is None, ( - f"{failed_request_id} generated {tokens} but " - f"expected {NUM_EXPECTED_TOKENS}") + assert num_generated_tokens == NUM_EXPECTED_TOKENS, ( + f"{request_id} generated {num_generated_tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") assert not engine.output_processor.has_unfinished_requests() engine.shutdown() @@ -72,9 +66,7 @@ async def test_load(monkeypatch): @pytest.mark.asyncio async def test_abort(monkeypatch): - # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 - # so that in the future when we switch, we don't have to change all the - # tests. + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -82,47 +74,41 @@ async def test_abort(monkeypatch): NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 100 - REQUEST_IDS_TO_ABORT = [1 + idx * 10 for idx in range(10)] + REQUEST_IDS_TO_ABORT = range(1, 100, 10) request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] # Create concurrent requests. - tasks = [] + tasks: List[asyncio.Task] = [] for request_id in request_ids: tasks.append( asyncio.create_task( generate(engine, request_id, NUM_EXPECTED_TOKENS))) - + # API server cancels requests when they are aborted. for idx in REQUEST_IDS_TO_ABORT: tasks[idx].cancel() + await asyncio.sleep(0.05) # Confirm the other requests are okay. - failed_request_id = None - tokens = None for idx, task in enumerate(tasks): - # Confirm that it was actually canceled. if idx in REQUEST_IDS_TO_ABORT: with pytest.raises(asyncio.CancelledError): await task - - # Otherwise, make sure the request was not impacted. - num_generated_tokens, request_id = await task - if (num_generated_tokens != NUM_EXPECTED_TOKENS - and failed_request_id is None): - failed_request_id = request_id - tokens = num_generated_tokens - - assert failed_request_id is None, ( - f"{failed_request_id} generated {tokens} but " - f"expected {NUM_EXPECTED_TOKENS}") + else: + # Otherwise, make sure the request was not impacted. + num_generated_tokens, request_id = await task + assert num_generated_tokens == NUM_EXPECTED_TOKENS, ( + f"{request_id} generated {num_generated_tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") assert not engine.output_processor.has_unfinished_requests() # Confirm we can do another generation. - task = asyncio.create_task(generate( - engine, REQUEST_IDS_TO_ABORT[0], NUM_EXPECTED_TOKENS)) + request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}" + task = asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS)) num_generated_tokens, request_id = await task assert num_generated_tokens == NUM_EXPECTED_TOKENS assert not engine.output_processor.has_unfinished_requests() diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index ddd220c4b9be6..60cb986f8bbce 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -35,5 +35,5 @@ def update_from_output(self, output: "EngineCoreOutput", # not stream outputs for partially completed prefills # (scheduler.update_from_output makes EngineCoreOutput # iff num_computed_tokens == num_tokens). 
- assert(len(output.new_token_ids) > 0) + assert (len(output.new_token_ids) > 0) self.num_prompt_tokens += prompt_len From 605c5f0d8f8a0f65f2fd5d81a66badfc7975cbdb Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 02:07:35 +0000 Subject: [PATCH 63/67] passing --- tests/v1/engine/test_async_llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 6764ee799abb0..2c805e18eebae 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -85,10 +85,10 @@ async def test_abort(monkeypatch): asyncio.create_task( generate(engine, request_id, NUM_EXPECTED_TOKENS))) - # API server cancels requests when they are aborted. + # API server cancels requests when they disconnect. for idx in REQUEST_IDS_TO_ABORT: tasks[idx].cancel() - await asyncio.sleep(0.05) + await asyncio.sleep(0.1) # Confirm the other requests are okay. for idx, task in enumerate(tasks): From d0013a48ae5a1d9d1c43f319323793ee11528920 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 02:09:30 +0000 Subject: [PATCH 64/67] ruff ruff --- tests/v1/engine/test_output_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 912bac9513c86..920b12459d93d 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -239,8 +239,8 @@ def test_iteration_stats(): eos_token_id=None, lora_request=None, sampling_params=SamplingParams(), - ) for idx, ( - prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ) for idx, (prompt, prompt_tokens) in enumerate( + zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] # Add all requests except one to the OutputProcessor. From e01d236f7acd468a7a3c79abb12c057cdc789747 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 02:27:09 +0000 Subject: [PATCH 65/67] format --- tests/v1/engine/test_output_processor.py | 40 +++++++++++++----------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 920b12459d93d..c04e170a9f433 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -11,9 +11,10 @@ TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config() -TOKENIZER_GROUP = init_tokenizer_from_configs( - VLLM_CONFIG.model_config, VLLM_CONFIG.scheduler_config, - VLLM_CONFIG.parallel_config, VLLM_CONFIG.lora_config) +TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config, + VLLM_CONFIG.scheduler_config, + VLLM_CONFIG.parallel_config, + VLLM_CONFIG.lora_config) tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) FULL_STRINGS = [ @@ -110,7 +111,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): break # Step the Detokenizer. - processed_outputs = output_processor.process_outputs(outputs,) + processed_outputs = output_processor.process_outputs(outputs, ) request_outputs = processed_outputs.request_outputs requests_to_abort = processed_outputs.reqs_to_abort assert len(requests_to_abort) == 0 @@ -147,19 +148,19 @@ def test_stop_string(include_stop_str_in_output: bool): # Make N requests. 
requests = [ - EngineCoreRequest( - request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - eos_token_id=None, - lora_request=None, - sampling_params=SamplingParams()) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + EngineCoreRequest(request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams()) + for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] # Add requests to the detokenizer. @@ -239,8 +240,9 @@ def test_iteration_stats(): eos_token_id=None, lora_request=None, sampling_params=SamplingParams(), - ) for idx, (prompt, prompt_tokens) in enumerate( - zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] # Add all requests except one to the OutputProcessor. From a53f089c11925adaa215d17fb83e63b1b4418e83 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 02:36:12 +0000 Subject: [PATCH 66/67] run isort --- tests/v1/engine/test_output_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index c04e170a9f433..57e3e2a649839 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -4,8 +4,8 @@ from transformers import AutoTokenizer from vllm.engine.arg_utils import EngineArgs -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine.output_processor import OutputProcessor From 3e45fc66144f9a5088cb605e2d1b96558e815a98 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 03:00:04 +0000 Subject: [PATCH 67/67] undo fat finger Signed-off-by: rshaw@neuralmagic.com --- tests/v1/engine/test_output_processor.py | 32 ++++++++++++++---------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 57e3e2a649839..4735c6f947537 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -148,19 +148,25 @@ def test_stop_string(include_stop_str_in_output: bool): # Make N requests. 
requests = [ - EngineCoreRequest(request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - eos_token_id=None, - lora_request=None, - sampling_params=SamplingParams()) - for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + EngineCoreRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + )) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] # Add requests to the detokenizer.