From cfa8c2bfd6fd5836f0c8a3750131890a54834fb2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:06:38 +0000 Subject: [PATCH 01/67] added code Signed-off-by: rshaw@neuralmagic.com --- benchmarks/benchmark_serving.py | 2 +- .../openai_chat_completion_client.py | 2 +- vllm/v1/core/scheduler.py | 21 ++++++--- vllm/v1/engine/__init__.py | 3 ++ vllm/v1/engine/async_llm.py | 45 ++++++++++-------- vllm/v1/engine/core.py | 47 +++++-------------- vllm/v1/engine/core_client.py | 40 +++++----------- vllm/v1/engine/llm_engine.py | 6 +-- vllm/v1/metrics/__init__.py | 0 vllm/v1/metrics/loggers.py | 39 +++++++++++++++ vllm/v1/metrics/stats.py | 20 ++++++++ 11 files changed, 134 insertions(+), 91 deletions(-) create mode 100644 vllm/v1/metrics/__init__.py create mode 100644 vllm/v1/metrics/loggers.py create mode 100644 vllm/v1/metrics/stats.py diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4eb0e1f8ac903..7698e7f50120c 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -791,7 +791,7 @@ def main(args: argparse.Namespace): base_url = f"http://{args.host}:{args.port}" tokenizer = get_tokenizer(tokenizer_id, - tokenizer_mode=tokenizer_mode, + # tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) if args.dataset is not None: diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index bbada3891bd19..a7925f345709a 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -2,7 +2,7 @@ # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" +openai_api_base = "http://localhost:8001/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index b26716f5c02e6..6d280a53802dd 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -8,7 +8,8 @@ from vllm.sampling_params import SamplingParams from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager -from vllm.v1.engine import EngineCoreOutput +from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs +from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -394,12 +395,12 @@ def update_from_output( self, scheduler_output: "SchedulerOutput", model_runner_output: "ModelRunnerOutput", - ) -> List[EngineCoreOutput]: + ) -> EngineCoreOutputs: # NOTE(woosuk): This method doesn't consider speculative decoding. sampled_token_ids = model_runner_output.sampled_token_ids num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] - engine_core_outputs: List[EngineCoreOutput] = [] + outputs: List[EngineCoreOutput] = [] for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -438,7 +439,7 @@ def update_from_output( finished=request.is_finished(), finish_reason=request.get_finished_reason(), stop_reason=request.stop_reason) - engine_core_outputs.append(output) + outputs.append(output) # Breakout of the loop. 
if stopped: @@ -446,7 +447,10 @@ def update_from_output( new_running.append(request) self.running = new_running - return engine_core_outputs + return EngineCoreOutputs( + outputs=outputs, + scheduler_stats=self.make_stats(), + ) def _check_stop(self, request: Request) -> bool: if (request.num_tokens >= self.max_model_len @@ -514,7 +518,12 @@ def get_num_unfinished_requests(self) -> int: def has_unfinished_requests(self) -> bool: return self.get_num_unfinished_requests() > 0 - + + def make_stats(self) -> SchedulerStats: + return SchedulerStats( + num_running_reqs=len(self.running), + num_waiting_reqs=len(self.waiting), + ) @dataclass class NewRequestData: diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 5e3c5e327ef63..4fb2a310064f5 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -4,6 +4,8 @@ import msgspec +from vllm.v1.metrics.stats import SchedulerStats + if TYPE_CHECKING: from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs @@ -56,6 +58,7 @@ class EngineCoreOutputs( # [num_reqs] outputs: List[EngineCoreOutput] + scheduler_stats: SchedulerStats @dataclass diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5daae45dee85c..33955a8c4acb9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -4,9 +4,8 @@ from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.protocol import EngineClient -from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType +from vllm.inputs import INPUT_REGISTRY, PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -21,6 +20,9 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor +from vllm.v1.metrics.loggers import StatLoggerBase +from vllm.v1.metrics.stats import SchedulerStats +from vllm.v1.metrics.loggers import LoggingStatLogger from vllm.v1.executor.abstract import Executor logger = init_logger(__name__) @@ -33,19 +35,16 @@ def __init__( vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - input_registry: InputRegistry = INPUT_REGISTRY, - use_cached_outputs: bool = False, - log_requests: bool = True, - start_engine_loop: bool = True, + log_requests: bool, ) -> None: - assert start_engine_loop - + # Logging. self.log_requests = log_requests self.log_stats = log_stats - self.stat_loggers = stat_loggers + self.stat_loggers: List[StatLoggerBase] = [ + LoggingStatLogger(), + # PrometheusStatLogger(), + ] self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -65,7 +64,7 @@ def __init__( cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, tokenizer=self.tokenizer, - input_registry=input_registry, + input_registry=INPUT_REGISTRY, ) # Detokenizer (converts EngineCoreOutputs --> RequestOutput). 
@@ -82,7 +81,6 @@ def __init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, - log_stats=self.log_stats, ) self.output_handler: Optional[asyncio.Task] = None @@ -92,9 +90,7 @@ def from_engine_args( cls, engine_args: AsyncEngineArgs, engine_config: Optional[VllmConfig] = None, - start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, ) -> "AsyncLLM": """Create an AsyncLLM from the EngineArgs.""" @@ -112,9 +108,6 @@ def from_engine_args( executor_class=executor_class, log_requests=not engine_args.disable_log_requests, log_stats=not engine_args.disable_log_stats, - start_engine_loop=start_engine_loop, - usage_context=usage_context, - stat_loggers=stat_loggers, ) def shutdown(self): @@ -254,7 +247,7 @@ async def _run_output_handler(self): outputs = await self.engine_core.get_output_async() # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step(outputs) + request_outputs, reqs_to_abort = self.detokenizer.step(outputs.outputs) # 3) Put the RequestOutputs into the per-request queues. self._process_request_outputs(request_outputs) @@ -262,6 +255,11 @@ async def _run_output_handler(self): # 4) Abort any requests that finished due to stop strings. await self.engine_core.abort_requests_async(reqs_to_abort) + # 5) Log any stats. + await self._log_stats( + scheduler_stats=outputs.scheduler_stats + ) + except Exception as e: logger.exception("EngineCore output handler hit an error: %s", e) kill_process_tree(os.getpid()) @@ -278,6 +276,15 @@ async def abort(self, request_id: str) -> None: if request_id in self.rid_to_queue: del self.rid_to_queue[request_id] + + async def _log_stats(self, scheduler_stats: SchedulerStats): + """Log stats to the stat loggers.""" + if not self.log_stats: + return + + for logger in self.stat_loggers: + logger.log(scheduler_stats=scheduler_stats) + def encode( self, prompt: PromptType, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 975ce11fe8aff..6949ea129a2ae 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -19,7 +19,8 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) + EngineCoreRequestType, EngineCoreRequestUnion, + SchedulerStats) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -28,9 +29,8 @@ logger = init_logger(__name__) -POLLING_TIMEOUT_MS = 5000 +POLLING_TIMEOUT_MS = 2500 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5 class EngineCore: @@ -40,10 +40,8 @@ def __init__( self, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, ): assert vllm_config.model_config.runner_type != "pooling" - self.log_stats = log_stats logger.info("Initializing an LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) @@ -62,8 +60,6 @@ def __init__( vllm_config.cache_config, vllm_config.lora_config) - self._last_logging_time = time.time() - self.mm_input_mapper_server = MMInputMapperServer( vllm_config.model_config) @@ -114,7 +110,7 @@ def abort_requests(self, request_ids: List[str]): self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) - def step(self) -> List[EngineCoreOutput]: + def step(self) -> EngineCoreOutputs: """Schedule, execute, and make 
output.""" if not self.scheduler.has_unfinished_requests(): @@ -143,9 +139,8 @@ def __init__( ready_pipe: Connection, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, ): - super().__init__(vllm_config, executor_class, log_stats) + super().__init__(vllm_config, executor_class) # Background Threads and Queues for IO. These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, @@ -153,7 +148,7 @@ def __init__( # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue() - self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue() + self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, ), daemon=True).start() @@ -217,7 +212,10 @@ def run_busy_loop(self): self._handle_client_request(req) break except queue.Empty: - self._log_stats() + # Push out most recent scheduler stats to client. + stats = self.scheduler.make_stats() + self.output_queue.put_nowait(EngineCoreOutputs( + outputs=[], scheduler_stats=stats)) logger.debug("EngineCore busy loop waiting.") except BaseException: raise @@ -230,27 +228,9 @@ def run_busy_loop(self): # 3) Step the engine core. outputs = self.step() - # 4) Put EngineCoreOutputs into the output queue. + # 5) Put EngineCoreOutputs into the output queue. self.output_queue.put_nowait(outputs) - - self._log_stats() - - def _log_stats(self): - """Log basic stats every LOGGING_TIME_S""" - - if not self.log_stats: - return - - now = time.time() - - if now - self._last_logging_time > LOGGING_TIME_S: - logger.info( - "RUNNING: %s | WAITING: %s", - len(self.scheduler.running), - len(self.scheduler.waiting), - ) - - self._last_logging_time = now + def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: """Handle EngineCoreRequest or EngineCoreABORT from Client.""" @@ -301,7 +281,6 @@ def process_output_socket(self, output_path: str): with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: - engine_core_outputs = self.output_queue.get() - outputs = EngineCoreOutputs(outputs=engine_core_outputs) + outputs = self.output_queue.get() encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index a4a45ae05ff9e..bb51a52d52dc8 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -12,7 +12,7 @@ from vllm.logger import init_logger from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, make_zmq_socket) -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) from vllm.v1.engine.core import EngineCore, EngineCoreProc @@ -40,7 +40,6 @@ def make_client( asyncio_mode: bool, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, ) -> "EngineCoreClient": # TODO: support this for debugging purposes. 
@@ -50,18 +49,18 @@ def make_client( "is not currently supported.") if multiprocess_mode and asyncio_mode: - return AsyncMPClient(vllm_config, executor_class, log_stats) + return AsyncMPClient(vllm_config, executor_class) if multiprocess_mode and not asyncio_mode: - return SyncMPClient(vllm_config, executor_class, log_stats) + return SyncMPClient(vllm_config, executor_class) - return InprocClient(vllm_config, executor_class, log_stats) + return InprocClient(vllm_config, executor_class) @abstractmethod def shutdown(self): ... - def get_output(self) -> List[EngineCoreOutput]: + def get_output(self) -> EngineCoreOutputs: raise NotImplementedError def add_request(self, request: EngineCoreRequest) -> None: @@ -73,7 +72,7 @@ def profile(self, is_start: bool = True) -> None: def abort_requests(self, request_ids: List[str]) -> None: raise NotImplementedError - async def get_output_async(self) -> List[EngineCoreOutput]: + async def get_output_async(self) -> EngineCoreOutputs: raise NotImplementedError async def add_request_async(self, request: EngineCoreRequest) -> None: @@ -99,7 +98,7 @@ class InprocClient(EngineCoreClient): def __init__(self, *args, **kwargs): self.engine_core = EngineCore(*args, **kwargs) - def get_output(self) -> List[EngineCoreOutput]: + def get_output(self) -> EngineCoreOutputs: return self.engine_core.step() def add_request(self, request: EngineCoreRequest) -> None: @@ -133,7 +132,6 @@ def __init__( asyncio_mode: bool, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, ): # The child processes will send SIGUSR1 when unrecoverable # errors happen. We kill the process tree here so that the @@ -180,7 +178,6 @@ def sigusr1_handler(signum, frame): process_kwargs={ "vllm_config": vllm_config, "executor_class": executor_class, - "log_stats": log_stats, }) def shutdown(self): @@ -194,22 +191,17 @@ def shutdown(self): class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" - def __init__(self, - vllm_config: VllmConfig, - executor_class: Type[Executor], - log_stats: bool = False): + def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): super().__init__( asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, - log_stats=log_stats, ) - def get_output(self) -> List[EngineCoreOutput]: + def get_output(self) -> EngineCoreOutputs: (frame, ) = self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frame.buffer).outputs - return engine_core_outputs + return self.decoder.decode(frame.buffer) def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -232,23 +224,17 @@ def profile(self, is_start: bool = True) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, - vllm_config: VllmConfig, - executor_class: Type[Executor], - log_stats: bool = False): + def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): super().__init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, - log_stats=log_stats, ) - async def get_output_async(self) -> List[EngineCoreOutput]: + async def get_output_async(self) -> EngineCoreOutputs: frames = await self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs - - return engine_core_outputs + return self.decoder.decode(frames[0].buffer) async def _send_input(self, request_type: EngineCoreRequestType, request: 
EngineCoreRequestUnion) -> None: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 8ced3a34d2da3..7a75623aa5bf1 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -42,6 +42,7 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: + assert log_stats is False self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -74,7 +75,6 @@ def __init__( asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, - log_stats=False, ) @classmethod @@ -147,11 +147,11 @@ def add_request( def step(self) -> List[RequestOutput]: # 1) Get EngineCoreOutput from the EngineCore. - engine_core_outputs = self.engine_core.get_output() + outputs = self.engine_core.get_output() # 2) Detokenizer the EngineCoreOutput. request_outputs, requests_to_abort = self.detokenizer.step( - engine_core_outputs) + outputs.outputs) # 3) Abort requests that finished due to stopping criteria. if requests_to_abort: diff --git a/vllm/v1/metrics/__init__.py b/vllm/v1/metrics/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py new file mode 100644 index 0000000000000..660823dc402a5 --- /dev/null +++ b/vllm/v1/metrics/loggers.py @@ -0,0 +1,39 @@ +from abc import ABC, abstractmethod +import time + +from vllm.logger import init_logger +from vllm.v1.metrics.stats import SchedulerStats + +logger = init_logger(__name__) + +_LOCAL_LOGGING_INTERVAL_SEC = 5.0 + + +class StatLoggerBase(ABC): + + @abstractmethod + def log(self, scheduler_stats: SchedulerStats): + ... + + +class LoggingStatLogger(StatLoggerBase): + + def __init__(self): + self.last_log_time = time.monotonic() + + def log(self, scheduler_stats: SchedulerStats): + """Log Stats to standard output.""" + + # Log every _LOCAL_LOGGING_INTERVAL_SEC. + now = time.monotonic() + if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC: + return + self.last_log_time = now + + # Format and print output. 
+ logger.info( + "Running: %d reqs, Waiting: %d reqs ", + scheduler_stats.num_running_reqs, + scheduler_stats.num_waiting_reqs, + ) + \ No newline at end of file diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py new file mode 100644 index 0000000000000..6690a4aa6252f --- /dev/null +++ b/vllm/v1/metrics/stats.py @@ -0,0 +1,20 @@ +from dataclasses import dataclass + +@dataclass +class SchedulerStats: + """Stats associated with the scheduler.""" + + num_running_reqs: int = 0 + num_waiting_reqs: int = 0 + + # gpu_cache_usage: float = 0.0 + # gpu_prefix_cache_hit_rate: float = 0.0 + +@dataclass +class RequestStats: + """Stats associated with a request.""" + pass + +@dataclass +class Stats: + scheduler: SchedulerStats \ No newline at end of file From 6d8e4f300b5a5f37b050fd5105d53a9848d400d4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:09:09 +0000 Subject: [PATCH 02/67] fixed --- benchmarks/benchmark_serving.py | 2 +- vllm/v1/metrics/stats.py | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 7698e7f50120c..4eb0e1f8ac903 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -791,7 +791,7 @@ def main(args: argparse.Namespace): base_url = f"http://{args.host}:{args.port}" tokenizer = get_tokenizer(tokenizer_id, - # tokenizer_mode=tokenizer_mode, + tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) if args.dataset is not None: diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 6690a4aa6252f..c456cc6915692 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -9,12 +9,3 @@ class SchedulerStats: # gpu_cache_usage: float = 0.0 # gpu_prefix_cache_hit_rate: float = 0.0 - -@dataclass -class RequestStats: - """Stats associated with a request.""" - pass - -@dataclass -class Stats: - scheduler: SchedulerStats \ No newline at end of file From c78a56f92bbbbcdfbc632b5c5458993cbbfec2e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:09:44 +0000 Subject: [PATCH 03/67] fixed --- examples/online_serving/openai_chat_completion_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index a7925f345709a..bbada3891bd19 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -2,7 +2,7 @@ # Modify OpenAI's API key and API base to use vLLM's API server. 
openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8001/v1" +openai_api_base = "http://localhost:8000/v1" client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") From 7b397057ec58fb37419886f974d7c6deefba4036 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:25:34 +0000 Subject: [PATCH 04/67] updated --- vllm/v1/core/scheduler.py | 3 ++- vllm/v1/engine/async_llm.py | 15 ++++++--------- vllm/v1/engine/core.py | 21 +++++++++++---------- vllm/v1/engine/core_client.py | 16 +++++++++++----- vllm/v1/metrics/loggers.py | 5 ++--- vllm/v1/metrics/stats.py | 1 + 6 files changed, 33 insertions(+), 28 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 6d280a53802dd..f04e529891287 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -518,13 +518,14 @@ def get_num_unfinished_requests(self) -> int: def has_unfinished_requests(self) -> bool: return self.get_num_unfinished_requests() > 0 - + def make_stats(self) -> SchedulerStats: return SchedulerStats( num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting), ) + @dataclass class NewRequestData: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 33955a8c4acb9..7cf05ca9bed45 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -20,10 +20,9 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor -from vllm.v1.metrics.loggers import StatLoggerBase -from vllm.v1.metrics.stats import SchedulerStats -from vllm.v1.metrics.loggers import LoggingStatLogger from vllm.v1.executor.abstract import Executor +from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase +from vllm.v1.metrics.stats import SchedulerStats logger = init_logger(__name__) @@ -247,7 +246,8 @@ async def _run_output_handler(self): outputs = await self.engine_core.get_output_async() # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step(outputs.outputs) + request_outputs, reqs_to_abort = self.detokenizer.step( + outputs.outputs) # 3) Put the RequestOutputs into the per-request queues. self._process_request_outputs(request_outputs) @@ -256,9 +256,7 @@ async def _run_output_handler(self): await self.engine_core.abort_requests_async(reqs_to_abort) # 5) Log any stats. 
- await self._log_stats( - scheduler_stats=outputs.scheduler_stats - ) + await self._log_stats(scheduler_stats=outputs.scheduler_stats) except Exception as e: logger.exception("EngineCore output handler hit an error: %s", e) @@ -276,12 +274,11 @@ async def abort(self, request_id: str) -> None: if request_id in self.rid_to_queue: del self.rid_to_queue[request_id] - async def _log_stats(self, scheduler_stats: SchedulerStats): """Log stats to the stat loggers.""" if not self.log_stats: return - + for logger in self.stat_loggers: logger.log(scheduler_stats=scheduler_stats) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6949ea129a2ae..84dbe248e4d53 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -17,10 +17,9 @@ maybe_register_config_serialize_by_value) from vllm.utils import get_exception_traceback, zmq_socket_ctx from vllm.v1.core.scheduler import Scheduler -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion, - SchedulerStats) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -114,7 +113,8 @@ def step(self) -> EngineCoreOutputs: """Schedule, execute, and make output.""" if not self.scheduler.has_unfinished_requests(): - return [] + return EngineCoreOutputs( + outputs=[], scheduler_stats=self.scheduler.make_stats()) scheduler_output = self.scheduler.schedule() output = self.model_executor.execute_model(scheduler_output) @@ -139,9 +139,12 @@ def __init__( ready_pipe: Connection, vllm_config: VllmConfig, executor_class: Type[Executor], + log_stats: bool = False, ): super().__init__(vllm_config, executor_class) + self.log_stats = log_stats + # Background Threads and Queues for IO. These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, # and to overlap some serialization/deserialization with the @@ -212,10 +215,9 @@ def run_busy_loop(self): self._handle_client_request(req) break except queue.Empty: - # Push out most recent scheduler stats to client. - stats = self.scheduler.make_stats() - self.output_queue.put_nowait(EngineCoreOutputs( - outputs=[], scheduler_stats=stats)) + # Break out the loops so we can log_stats via step(). + if self.log_stats: + break logger.debug("EngineCore busy loop waiting.") except BaseException: raise @@ -230,7 +232,6 @@ def run_busy_loop(self): # 5) Put EngineCoreOutputs into the output queue. 
self.output_queue.put_nowait(outputs) - def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: """Handle EngineCoreRequest or EngineCoreABORT from Client.""" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index bb51a52d52dc8..176fa839c8f58 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -12,9 +12,9 @@ from vllm.logger import init_logger from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, make_zmq_socket) -from vllm.v1.engine import (EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder @@ -132,6 +132,7 @@ def __init__( asyncio_mode: bool, vllm_config: VllmConfig, executor_class: Type[Executor], + log_stats: bool, ): # The child processes will send SIGUSR1 when unrecoverable # errors happen. We kill the process tree here so that the @@ -178,6 +179,7 @@ def sigusr1_handler(signum, frame): process_kwargs={ "vllm_config": vllm_config, "executor_class": executor_class, + "log_stats": log_stats, }) def shutdown(self): @@ -191,11 +193,13 @@ def shutdown(self): class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): + def __init__(self, vllm_config: VllmConfig, + executor_class: Type[Executor]): super().__init__( asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, + log_stats=False, ) def get_output(self) -> EngineCoreOutputs: @@ -224,11 +228,13 @@ def profile(self, is_start: bool = True) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): + def __init__(self, vllm_config: VllmConfig, + executor_class: Type[Executor]): super().__init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, + log_stats=True, ) async def get_output_async(self) -> EngineCoreOutputs: diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 660823dc402a5..8feeef17542e6 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,5 +1,5 @@ -from abc import ABC, abstractmethod import time +from abc import ABC, abstractmethod from vllm.logger import init_logger from vllm.v1.metrics.stats import SchedulerStats @@ -10,7 +10,7 @@ class StatLoggerBase(ABC): - + @abstractmethod def log(self, scheduler_stats: SchedulerStats): ... 
@@ -36,4 +36,3 @@ def log(self, scheduler_stats: SchedulerStats): scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, ) - \ No newline at end of file diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index c456cc6915692..5ebb4fd5b37db 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,5 +1,6 @@ from dataclasses import dataclass + @dataclass class SchedulerStats: """Stats associated with the scheduler.""" From 6e9cd1cd9177c1ba00e4e207ff9b8991b233b4d9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:28:41 +0000 Subject: [PATCH 05/67] updated --- vllm/v1/engine/async_llm.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7cf05ca9bed45..b109e5e34ea7b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -5,7 +5,7 @@ from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient -from vllm.inputs import INPUT_REGISTRY, PromptType +from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -34,7 +34,11 @@ def __init__( vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool, - log_requests: bool, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + input_registry: InputRegistry = INPUT_REGISTRY, + use_cached_outputs: bool = False, + log_requests: bool = True, + start_engine_loop: bool = True, ) -> None: # Logging. @@ -63,7 +67,7 @@ def __init__( cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, tokenizer=self.tokenizer, - input_registry=INPUT_REGISTRY, + input_registry=input_registry, ) # Detokenizer (converts EngineCoreOutputs --> RequestOutput). @@ -89,6 +93,7 @@ def from_engine_args( cls, engine_args: AsyncEngineArgs, engine_config: Optional[VllmConfig] = None, + start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, ) -> "AsyncLLM": """Create an AsyncLLM from the EngineArgs.""" @@ -107,6 +112,8 @@ def from_engine_args( executor_class=executor_class, log_requests=not engine_args.disable_log_requests, log_stats=not engine_args.disable_log_stats, + start_engine_loop=start_engine_loop, + usage_context=usage_context, ) def shutdown(self): From 2657b7f96dec516622a48e0d3b369b6920191624 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:29:09 +0000 Subject: [PATCH 06/67] fixed --- vllm/v1/engine/async_llm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b109e5e34ea7b..247c5bc966255 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -41,7 +41,8 @@ def __init__( start_engine_loop: bool = True, ) -> None: - # Logging. 
+ assert start_engine_loop + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ From 249b9ff1eb8288f3fda1f05c9d54962a08787014 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 22:30:08 +0000 Subject: [PATCH 07/67] updated --- vllm/v1/engine/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 84dbe248e4d53..d65c1bee7643d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -28,8 +28,7 @@ logger = init_logger(__name__) -POLLING_TIMEOUT_MS = 2500 -POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 +POLLING_TIMEOUT_S = 2.5 class EngineCore: From c1f9292e7a14be2b07d392fe425f41e4f68984ab Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 11 Jan 2025 23:10:27 +0000 Subject: [PATCH 08/67] refactoring metrics --- vllm/v1/engine/async_llm.py | 24 +-- vllm/v1/engine/detokenizer.py | 273 ---------------------------------- vllm/v1/engine/llm_engine.py | 13 +- vllm/v1/metrics/loggers.py | 1 - vllm/v1/metrics/stats.py | 8 + 5 files changed, 27 insertions(+), 292 deletions(-) delete mode 100644 vllm/v1/engine/detokenizer.py diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 247c5bc966255..013140bf6cd40 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -18,7 +18,7 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase @@ -72,7 +72,7 @@ def __init__( ) # Detokenizer (converts EngineCoreOutputs --> RequestOutput). - self.detokenizer = Detokenizer( + self.output_processor = OutputProcessor( tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, trust_remote_code=vllm_config.model_config.trust_remote_code, @@ -152,7 +152,7 @@ async def add_request( priority) # 3) Add the request to Detokenizer (this process). - self.detokenizer.add_request(request) + self.output_processor.add_request(request) # 4) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(request) @@ -251,20 +251,22 @@ async def _run_output_handler(self): try: while True: # 1) Pull EngineCoreOutput from the EngineCore. - outputs = await self.engine_core.get_output_async() + engine_core_outputs = await self.engine_core.get_output_async() # 2) Detokenize based on the output. - request_outputs, reqs_to_abort = self.detokenizer.step( - outputs.outputs) + processed_outputs = self.output_processor.step(engine_core_outputs.outputs) # 3) Put the RequestOutputs into the per-request queues. - self._process_request_outputs(request_outputs) + self._process_request_outputs(processed_outputs.request_outputs) # 4) Abort any requests that finished due to stop strings. - await self.engine_core.abort_requests_async(reqs_to_abort) + await self.engine_core.abort_requests_async(processed_outputs.reqs_to_abort) # 5) Log any stats. 
- await self._log_stats(scheduler_stats=outputs.scheduler_stats) + await self._log_stats( + scheduler_stats=engine_core_outputs.scheduler_stats, + iteration_stats=processed_outputs.iteration_stats, + ) except Exception as e: logger.exception("EngineCore output handler hit an error: %s", e) @@ -275,7 +277,7 @@ async def abort(self, request_id: str) -> None: request_ids = [request_id] await self.engine_core.abort_requests_async(request_ids) - self.detokenizer.abort_requests(request_ids) + self.output_processor.abort_requests(request_ids) # If a request finishes while we await then the request_id # will be removed from the tracked queues before we get here. @@ -315,7 +317,7 @@ async def get_tokenizer( lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: assert lora_request is None - return self.detokenizer.tokenizer + return self.output_processor.tokenizer async def is_tracing_enabled(self) -> bool: return False diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py deleted file mode 100644 index 65be9e58e03c8..0000000000000 --- a/vllm/v1/engine/detokenizer.py +++ /dev/null @@ -1,273 +0,0 @@ -from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Union - -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger -from vllm.outputs import RequestOutput -from vllm.sampling_params import RequestOutputKind -from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest - -logger = init_logger(__name__) - - -@dataclass -class IncrementalDetokenizer: - - # Generation data - output_text: str - tokens: List[str] - token_ids: List[int] - - # Stop strings - stop: List[str] - include_stop_str_in_output: bool - - # Metadata for incremental detokenization - prefix_offset: int - read_offset: int - - # Parameters for detokenization - skip_special_tokens: bool - spaces_between_special_tokens: bool - output_kind: RequestOutputKind - - # TODO: Probably decouple these - request_id: str - prompt: Optional[str] - prompt_token_ids: List[int] - - # Tokenizer for this request - tokenizer: AnyTokenizer - - # Accounting for stop string buffering - stop_buffer_length: int - _last_output_text_offset: int = 0 - - @property - def output_token_ids(self) -> List[int]: - assert len(self.token_ids) >= len(self.prompt_token_ids) - return self.token_ids[len(self.prompt_token_ids):] - - @classmethod - def from_new_request( - cls, - tokenizer: AnyTokenizer, - request: EngineCoreRequest, - ) -> "IncrementalDetokenizer": - - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=tokenizer, - prompt_ids=request.prompt_token_ids, - skip_special_tokens=request.sampling_params.skip_special_tokens, - ) - - stops = request.sampling_params.stop - # Number of chars to hold back when stop strings are to be excluded - # from streamed output. - if stops and not request.sampling_params.include_stop_str_in_output: - stop_buffer_length = max(len(s) for s in stops) - 1 - else: - stop_buffer_length = 0 - - return cls( - output_text="", - tokens=tokens, - # Detokenizer mutates this list, so need a unique copy. - # NOTE(Nick): could we take ownership of it though? - token_ids=request.prompt_token_ids.copy(), - stop=stops, - include_stop_str_in_output=request.sampling_params. 
- include_stop_str_in_output, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=request.sampling_params.skip_special_tokens, - spaces_between_special_tokens=request.sampling_params. - spaces_between_special_tokens, - output_kind=request.sampling_params.output_kind, - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, - tokenizer=tokenizer, - stop_buffer_length=stop_buffer_length, - ) - - def add_tokens( - self, - new_token_ids: List[int], - finish_reason: Optional[str], - stop_reason: Optional[Union[int, str, None]], - ) -> Optional[RequestOutput]: - """ - Update RequestState for the request_id by: - 1) Detokenize the new token ids incrementally. - 2) Update the RequestOutput with the new text. - """ - - # 1) Detokenize the new token ids incrementally. - # TODO(woosuk): This method becomes very inefficient when the number of - # new_token_ids is more than 1. We need to optimize this. - decoded_text = "" - for new_token_id in new_token_ids: - self.token_ids.append(new_token_id) - (new_tokens, new_decoded_token_text, prefix_offset, - read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=self.token_ids, - prev_tokens=self.tokens, - prefix_offset=self.prefix_offset, - read_offset=self.read_offset, - skip_special_tokens=self.skip_special_tokens, - spaces_between_special_tokens=self. - spaces_between_special_tokens, - ) - - self.tokens.extend(new_tokens) - self.prefix_offset = prefix_offset - self.read_offset = read_offset - self.output_text += new_decoded_token_text - - decoded_text += new_decoded_token_text - - # 2) Evaluate stop criteria. - if self.stop: - stop = StopChecker.check_stop_strings( - output_text=self.output_text, - new_char_count=len(decoded_text), - stop=self.stop, - include_in_output=self.include_stop_str_in_output, - ) - if stop is not None: - stop_str, truncate_to = stop - if truncate_to != -1: - self.output_text = self.output_text[:truncate_to] - finish_reason = "stop" # TODO: use constant - stop_reason = stop_str - - # TODO: handle stop_token_ids here too? - - # 3) Update the RequestOutput object with the new text. - finished = bool(finish_reason) - if self.output_kind == RequestOutputKind.FINAL_ONLY \ - and not finished: - return None - - delta = self.output_kind == RequestOutputKind.DELTA - output_text = self._get_next_output_text(finished, delta) - token_ids = new_token_ids if delta else self.output_token_ids - - request_output = RequestOutput.new( - self.request_id, - self.prompt, - self.prompt_token_ids, - output_text, - token_ids, - finished, - ) - - if finished: - completion_output = request_output.outputs[0] - completion_output.finish_reason = finish_reason - completion_output.stop_reason = stop_reason - - return request_output - - def _get_next_output_text(self, finished: bool, delta: bool) -> str: - """If delta is True, only new text since the last call to - this method is returned""" - - # We return the full output text if the sequence is finished. 
- buffer_length = 0 if finished else self.stop_buffer_length - if not delta: - return self.output_text[:-buffer_length] if buffer_length else ( - self.output_text) - length = len(self.output_text) - buffer_length - last_offset = self._last_output_text_offset - if last_offset < length: - self._last_output_text_offset = length - return self.output_text[last_offset:length] - return "" - - -class Detokenizer: - - def __init__(self, - tokenizer_name: str, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - revision: Optional[str] = None): - # TODO: once we support LoRA, we should should pass the tokenizer - # here. We currently have two copies (this + in the LLMEngine). - self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, - tokenizer_mode=tokenizer_mode, - trust_remote_code=trust_remote_code, - revision=revision) - - # Request id -> IncrementalDetokenizer - self.request_states: Dict[str, IncrementalDetokenizer] = {} - - def is_request_active(self, request_id: str): - return request_id in self.request_states - - def get_num_unfinished_requests(self): - return len(self.request_states) - - def has_unfinished_requests(self) -> bool: - return len(self.request_states) > 0 - - def abort_requests( - self, - request_ids: Iterable[str], - ) -> None: - """Remove the request_ids from the Detokenizer.""" - - for request_id in request_ids: - self.request_states.pop(request_id, None) - - def add_request( - self, - request: EngineCoreRequest, - ): - """Add new request to the Detokenizer.""" - - assert (request.request_id not in self.request_states) - - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) - self.request_states[request.request_id] = request_state - - def step( - self, encore_core_outputs: List[EngineCoreOutput] - ) -> Tuple[List[RequestOutput], List[str]]: - """Update state and request the RequestOutputs to the LLMEngine.""" - - request_outputs: List[RequestOutput] = [] - requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs: - request_id = engine_core_output.request_id - detokenizer = self.request_states.get(request_id) - if detokenizer is None: - # Ignore output for already-aborted request. - continue - - # Detokenize and update state. - request_output = detokenizer.add_tokens( - new_token_ids=engine_core_output.new_token_ids, - finish_reason=engine_core_output.finish_reason, - stop_reason=engine_core_output.stop_reason, - ) - - if request_output is not None: - # Add to RequestOutputs list. - request_outputs.append(request_output) - - # Free completed requests. - if request_output.finished: - self.request_states.pop(request_id) - if not engine_core_output.finished: - requests_to_abort.append(request_id) - - # Return to EngineClient. 
- return request_outputs, requests_to_abort diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 7a75623aa5bf1..27321752c3efa 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -18,7 +18,7 @@ BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.v1.engine.core_client import EngineCoreClient -from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor @@ -42,7 +42,6 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - assert log_stats is False self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -62,7 +61,7 @@ def __init__( mm_registry=mm_registry) # Detokenizer (converts EngineCoreOutputs --> RequestOutput) - self.detokenizer = Detokenizer( + self.output_processor = OutputProcessor( tokenizer_name=vllm_config.model_config.tokenizer, tokenizer_mode=vllm_config.model_config.tokenizer_mode, trust_remote_code=vllm_config.model_config.trust_remote_code, @@ -104,10 +103,10 @@ def from_engine_args( multiprocess_mode=enable_multiprocessing) def get_num_unfinished_requests(self) -> int: - return self.detokenizer.get_num_unfinished_requests() + return self.output_processor.get_num_unfinished_requests() def has_unfinished_requests(self) -> bool: - return self.detokenizer.has_unfinished_requests() + return self.output_processor.has_unfinished_requests() @classmethod def validate_outputs(cls, outputs, output_type): @@ -117,7 +116,7 @@ def abort_request(self, request_ids: List[str]) -> None: """Remove request_ids from EngineCore and Detokenizer.""" self.engine_core.abort_requests(request_ids) - self.detokenizer.abort_requests(request_ids) + self.output_processor.abort_requests(request_ids) def add_request( self, @@ -139,7 +138,7 @@ def add_request( priority) # 2) Add the request to Detokenizer. - self.detokenizer.add_request(request) + self.output_processor.add_request(request) # 3) Add the request to EngineCore. self.engine_core.add_request(request) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 8feeef17542e6..4ed9193128647 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -22,7 +22,6 @@ def __init__(self): self.last_log_time = time.monotonic() def log(self, scheduler_stats: SchedulerStats): - """Log Stats to standard output.""" # Log every _LOCAL_LOGGING_INTERVAL_SEC. 
now = time.monotonic() diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 5ebb4fd5b37db..d9c7aa400005a 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -10,3 +10,11 @@ class SchedulerStats: # gpu_cache_usage: float = 0.0 # gpu_prefix_cache_hit_rate: float = 0.0 + + +@dataclass +class IterationStats: + """Stats associated with a single iteration""" + + num_generation_tokens: int = 0 + num_prompt_tokens: int = 0 From c641866d2454a5d2e7ec55a38eeb9b51e94fb4ea Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 14:12:00 +0000 Subject: [PATCH 09/67] updated --- vllm/v1/engine/llm_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 7a75623aa5bf1..ac392f5e4f4cf 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -42,7 +42,6 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - assert log_stats is False self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). From 1ce7a5fcf75f4a4eec620ed50189e06b5431ac47 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 14:13:37 +0000 Subject: [PATCH 10/67] updated --- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 247c5bc966255..e0ceb59dffcbd 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -47,7 +47,7 @@ def __init__( self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ LoggingStatLogger(), - # PrometheusStatLogger(), + # TODO(rob): PrometheusStatLogger(), ] self.model_config = vllm_config.model_config diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index d65c1bee7643d..e7f90d3c62142 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -214,10 +214,10 @@ def run_busy_loop(self): self._handle_client_request(req) break except queue.Empty: - # Break out the loops so we can log_stats via step(). + logger.debug("EngineCore busy loop waiting.") + # Break out the loop so we can log_stats in step(). 
if self.log_stats: break - logger.debug("EngineCore busy loop waiting.") except BaseException: raise From f8de299c7c741dbbec83abbf5132b76bc7fda701 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 14:16:36 +0000 Subject: [PATCH 11/67] added output processor --- vllm/v1/engine/output_processor.py | 286 +++++++++++++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 vllm/v1/engine/output_processor.py diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py new file mode 100644 index 0000000000000..e2fcf29cf4915 --- /dev/null +++ b/vllm/v1/engine/output_processor.py @@ -0,0 +1,286 @@ +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple, Union + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import RequestOutputKind +from vllm.transformers_utils.detokenizer_utils import ( + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.metrics.stats import IterationStats + +logger = init_logger(__name__) + +@dataclass +class OutputProcessorOutput: + """Output from the OutputProcessor.""" + + request_outputs: List[RequestOutput] + requests_to_abort: List[str] + iteration_stats: IterationStats + +@dataclass +class IncrementalDetokenizer: + + # Generation data + output_text: str + tokens: List[str] + token_ids: List[int] + + # Stop strings + stop: List[str] + include_stop_str_in_output: bool + + # Metadata for incremental detokenization + prefix_offset: int + read_offset: int + + # Parameters for detokenization + skip_special_tokens: bool + spaces_between_special_tokens: bool + output_kind: RequestOutputKind + + # TODO: Probably decouple these + request_id: str + prompt: Optional[str] + prompt_token_ids: List[int] + + # Tokenizer for this request + tokenizer: AnyTokenizer + + # Accounting for stop string buffering + stop_buffer_length: int + _last_output_text_offset: int = 0 + + @property + def output_token_ids(self) -> List[int]: + assert len(self.token_ids) >= len(self.prompt_token_ids) + return self.token_ids[len(self.prompt_token_ids):] + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + ) -> "IncrementalDetokenizer": + + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=request.prompt_token_ids, + skip_special_tokens=request.sampling_params.skip_special_tokens, + ) + + stops = request.sampling_params.stop + # Number of chars to hold back when stop strings are to be excluded + # from streamed output. + if stops and not request.sampling_params.include_stop_str_in_output: + stop_buffer_length = max(len(s) for s in stops) - 1 + else: + stop_buffer_length = 0 + + return cls( + output_text="", + tokens=tokens, + # Detokenizer mutates this list, so need a unique copy. + # NOTE(Nick): could we take ownership of it though? + token_ids=request.prompt_token_ids.copy(), + stop=stops, + include_stop_str_in_output=request.sampling_params. + include_stop_str_in_output, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=request.sampling_params.skip_special_tokens, + spaces_between_special_tokens=request.sampling_params. 
+ spaces_between_special_tokens, + output_kind=request.sampling_params.output_kind, + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + tokenizer=tokenizer, + stop_buffer_length=stop_buffer_length, + ) + + def add_tokens( + self, + new_token_ids: List[int], + finish_reason: Optional[str], + stop_reason: Optional[Union[int, str, None]], + ) -> Optional[RequestOutput]: + """ + Update RequestState for the request_id by: + 1) Detokenize the new token ids incrementally. + 2) Update the RequestOutput with the new text. + """ + + # 1) Detokenize the new token ids incrementally. + # TODO(woosuk): This method becomes very inefficient when the number of + # new_token_ids is more than 1. We need to optimize this. + decoded_text = "" + for new_token_id in new_token_ids: + self.token_ids.append(new_token_id) + (new_tokens, new_decoded_token_text, prefix_offset, + read_offset) = detokenize_incrementally( + tokenizer=self.tokenizer, + all_input_ids=self.token_ids, + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + + self.tokens.extend(new_tokens) + self.prefix_offset = prefix_offset + self.read_offset = read_offset + self.output_text += new_decoded_token_text + + decoded_text += new_decoded_token_text + + # 2) Evaluate stop criteria. + if self.stop: + stop = StopChecker.check_stop_strings( + output_text=self.output_text, + new_char_count=len(decoded_text), + stop=self.stop, + include_in_output=self.include_stop_str_in_output, + ) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + self.output_text = self.output_text[:truncate_to] + finish_reason = "stop" # TODO: use constant + stop_reason = stop_str + + # TODO: handle stop_token_ids here too? + + # 3) Update the RequestOutput object with the new text. + finished = bool(finish_reason) + if self.output_kind == RequestOutputKind.FINAL_ONLY \ + and not finished: + return None + + delta = self.output_kind == RequestOutputKind.DELTA + output_text = self._get_next_output_text(finished, delta) + token_ids = new_token_ids if delta else self.output_token_ids + + request_output = RequestOutput.new( + self.request_id, + self.prompt, + self.prompt_token_ids, + output_text, + token_ids, + finished, + ) + + if finished: + completion_output = request_output.outputs[0] + completion_output.finish_reason = finish_reason + completion_output.stop_reason = stop_reason + + return request_output + + def _get_next_output_text(self, finished: bool, delta: bool) -> str: + """If delta is True, only new text since the last call to + this method is returned""" + + # We return the full output text if the sequence is finished. + buffer_length = 0 if finished else self.stop_buffer_length + if not delta: + return self.output_text[:-buffer_length] if buffer_length else ( + self.output_text) + length = len(self.output_text) - buffer_length + last_offset = self._last_output_text_offset + if last_offset < length: + self._last_output_text_offset = length + return self.output_text[last_offset:length] + return "" + + +class OutputProcessor: + + def __init__(self, + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None): + # TODO: once we support LoRA, we should should pass the tokenizer + # here. We currently have two copies (this + in the LLMEngine). 
+ self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + revision=revision) + + # Request id -> IncrementalDetokenizer + self.request_states: Dict[str, IncrementalDetokenizer] = {} + + def is_request_active(self, request_id: str): + return request_id in self.request_states + + def get_num_unfinished_requests(self): + return len(self.request_states) + + def has_unfinished_requests(self) -> bool: + return len(self.request_states) > 0 + + def abort_requests( + self, + request_ids: Iterable[str], + ) -> None: + """Remove the request_ids from the Detokenizer.""" + + for request_id in request_ids: + self.request_states.pop(request_id, None) + + def add_request( + self, + request: EngineCoreRequest, + ): + """Add new request to the Detokenizer.""" + + assert (request.request_id not in self.request_states) + + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, request) + self.request_states[request.request_id] = request_state + + def step( + self, encore_core_outputs: List[EngineCoreOutput] + ) -> OutputProcessorOutput: + """Update state and request the RequestOutputs to the LLMEngine.""" + + iteraton_stats = IterationStats() + request_outputs: List[RequestOutput] = [] + requests_to_abort: List[str] = [] + for engine_core_output in encore_core_outputs: + request_id = engine_core_output.request_id + detokenizer = self.request_states.get(request_id) + if detokenizer is None: + # Ignore output for already-aborted request. + continue + + # Detokenize and update state. + request_output = detokenizer.add_tokens( + new_token_ids=engine_core_output.new_token_ids, + finish_reason=engine_core_output.finish_reason, + stop_reason=engine_core_output.stop_reason, + ) + + if request_output is not None: + # Add to RequestOutputs list. + request_outputs.append(request_output) + + # Free completed requests. + if request_output.finished: + self.request_states.pop(request_id) + if not engine_core_output.finished: + requests_to_abort.append(request_id) + + # Return to EngineClient. 
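# Why step() reports requests_to_abort, shown as a hedged toy reconciliation
# (plain dicts rather than vLLM types): the detokenizer can finish a request
# on a stop *string* before EngineCore, which only sees token ids, knows the
# request is done, so those request ids must still be aborted in EngineCore.
core_finished = {"req-a": False, "req-b": True}     # EngineCore's view
detok_finished = {"req-a": True, "req-b": True}     # detokenizer's view
requests_to_abort = [rid for rid, done in detok_finished.items()
                     if done and not core_finished[rid]]
assert requests_to_abort == ["req-a"]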
+ return OutputProcessorOutput( + request_outputs=request_outputs, + requests_to_abort=requests_to_abort, + iteration_stats=iteraton_stats, + ) From 49ca9bbdd5a53268cfb6c894978d3d00e477e149 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 18:44:21 +0000 Subject: [PATCH 12/67] added all files --- vllm/v1/engine/async_llm.py | 117 +++++----- vllm/v1/engine/core_client.py | 6 +- vllm/v1/engine/detokenizer.py | 180 ++++++++++++++++ vllm/v1/engine/llm_engine.py | 68 +++--- vllm/v1/engine/output_processor.py | 332 ++++++++--------------------- vllm/v1/engine/processor.py | 5 +- vllm/v1/engine/request_state.py | 46 ++++ vllm/v1/metrics/stats.py | 24 ++- 8 files changed, 428 insertions(+), 350 deletions(-) create mode 100644 vllm/v1/engine/detokenizer.py create mode 100644 vllm/v1/engine/request_state.py diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d5563b5d481cf..d7068d1262843 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -20,9 +20,10 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor +from vllm.v1.engine.request_state import RequestState from vllm.v1.executor.abstract import Executor from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase -from vllm.v1.metrics.stats import SchedulerStats +from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -52,31 +53,29 @@ def __init__( self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer = init_tokenizer_from_configs( + self.tokenizer_group = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) - self.tokenizer.ping() + self.tokenizer_group.ping() - # Request streams (map of request_id -> queue). - self.rid_to_queue: Dict[str, asyncio.Queue] = {} + # Request States (map of request_id -> RequestState). + self.request_states: Dict[str, RequestState] = {} - # Processor (converts Inputs --> EngineCoreRequests). + # Processor (convert Inputs --> EngineCoreRequests). self.processor = Processor( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, + tokenizer_group=self.tokenizer_group, input_registry=input_registry, ) - # Detokenizer (converts EngineCoreOutputs --> RequestOutput). + # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). self.output_processor = OutputProcessor( - tokenizer_name=vllm_config.model_config.tokenizer, - tokenizer_mode=vllm_config.model_config.tokenizer_mode, - trust_remote_code=vllm_config.model_config.trust_remote_code, - revision=vllm_config.model_config.tokenizer_revision, + request_states=self.request_states, + log_stats=self.log_stats, ) # EngineCore (starts the engine in background process). @@ -139,28 +138,31 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - # 1) Create a new output queue for the request. - if request_id in self.rid_to_queue: + if request_id in self.request_states: raise ValueError(f"Request id {request_id} already running.") - self.rid_to_queue[request_id] = asyncio.Queue() - # 2) Convert Input --> Request. + # 1) Convert Input --> Request. 
request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 3) Add the request to Detokenizer (this process). - self.output_processor.add_request(request) + # 2) Make a nnew RequestState and queue. + queue: asyncio.Queue[RequestOutput] = asyncio.Queue() + self.request_states[request_id] = RequestState.from_new_request( + tokenizer=self.get_tokenizer(), + request=request, + queue=queue, + ) - # 4) Add the EngineCoreRequest to EngineCore (separate process). + # 3) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(request) if self.log_requests: logger.info("Added request %s.", request_id) - return self.rid_to_queue[request_id] + return queue # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion @@ -197,8 +199,7 @@ async def generate( # we can call __init__ before the event loop, which enables us # to handle startup failure gracefully in the OpenAI server. if self.output_handler is None: - self.output_handler = asyncio.create_task( - self._run_output_handler()) + self.output_handler = asyncio.create_task(self.step_async()) q = await self.add_request( request_id, @@ -217,53 +218,40 @@ async def generate( # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - # Note: both Detokenizer and EngineCore handle their - # own request cleanup based on finished. + # Note: OutputProcessor removes from request_states. if out.finished: - del self.rid_to_queue[request_id] yield out break yield out - # If the request is disconnected by the client, the - # generate() task will be canceled. So, we abort the - # request if we end up here. + # If the request is disconnected by the client, the generate() + # task will be canceled. So, we abort the request if we end up here. except asyncio.CancelledError: await self.abort(request_id) raise - def _process_request_outputs(self, request_outputs: List[RequestOutput]): - """Process outputs by putting them into per-request queues.""" - - for request_output in request_outputs: - request_id = request_output.request_id - - # Note: it is possible a request was aborted and removed from - # the state due to client cancellations, so if we encounter a - # request id not in the state, we skip. - if request_id in self.rid_to_queue: - self.rid_to_queue[request_id].put_nowait(request_output) - - async def _run_output_handler(self): - """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + async def step_async(self): + """Busy loop: Pull From EngineCore -> Process -> Push to Queues""" try: while True: - # 1) Pull EngineCoreOutput from the EngineCore. + # 1) Pull EngineCoreOutputs from the EngineCore. engine_core_outputs = await self.engine_core.get_output_async() - # 2) Detokenize based on the output. - processed_outputs = self.output_processor.step(engine_core_outputs.outputs) + # 2) Process EngineCoreOutputs, pushing RequestOutputs into + # asyncio queues for handling by the per-req generate() task. + processed_outputs = self.output_processor.process_outputs( + engine_core_outputs) - # 3) Put the RequestOutputs into the per-request queues. - self._process_request_outputs(processed_outputs.request_outputs) + # 3) Abort any reqs that finished due to stop strings. 
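# A minimal, self-contained sketch of the queue-per-request streaming pattern
# this handler relies on: one asyncio.Queue per request id, a background
# producer standing in for the EngineCore pull, and per-request consumers
# standing in for generate(). All names below are illustrative, not vLLM APIs.
import asyncio

async def fan_out_demo() -> None:
    queues = {"req-0": asyncio.Queue(), "req-1": asyncio.Queue()}

    async def producer() -> None:
        for step in range(3):
            for rid, q in queues.items():
                q.put_nowait(f"{rid}: token {step}")
            await asyncio.sleep(0)
        for q in queues.values():
            q.put_nowait(None)          # sentinel: request finished

    async def consume(rid: str) -> None:
        q = queues[rid]
        while (out := await q.get()) is not None:
            print(out)

    await asyncio.gather(producer(), *(consume(rid) for rid in queues))

# asyncio.run(fan_out_demo())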
+ await self.engine_core.abort_requests_async( + processed_outputs.reqs_to_abort) - # 4) Abort any requests that finished due to stop strings. - await self.engine_core.abort_requests_async(processed_outputs.reqs_to_abort) - - # 5) Log any stats. - await self._log_stats( + # 4) Logging. + # TODO(rob): make into a coroutine and launch it in + # background thread once we add Prometheus. + self._log_stats( scheduler_stats=engine_core_outputs.scheduler_stats, iteration_stats=processed_outputs.iteration_stats, ) @@ -273,19 +261,21 @@ async def _run_output_handler(self): kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: - """Abort RequestId in self, detokenizer, and engine core.""" + """Abort a Request.""" - request_ids = [request_id] - await self.engine_core.abort_requests_async(request_ids) - self.output_processor.abort_requests(request_ids) + # Remove from EngineCore. + await self.engine_core.abort_requests_async([request_id]) - # If a request finishes while we await then the request_id - # will be removed from the tracked queues before we get here. - if request_id in self.rid_to_queue: - del self.rid_to_queue[request_id] + # Remove from AsyncLLM. + # Note: the request can finish during await, so check to make + # sure it is still active in the tracker before we pop. + _ = self.request_states.pop(request_id, None) - async def _log_stats(self, scheduler_stats: SchedulerStats): - """Log stats to the stat loggers.""" + def _log_stats( + self, + scheduler_stats: SchedulerStats, + iteration_stats: IterationStats, + ): if not self.log_stats: return @@ -316,8 +306,7 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: - assert lora_request is None - return self.output_processor.tokenizer + return self.tokenizer_group.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: return False diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 176fa839c8f58..1c680b83da065 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -105,7 +105,8 @@ def add_request(self, request: EngineCoreRequest) -> None: self.engine_core.add_request(request) def abort_requests(self, request_ids: List[str]) -> None: - self.engine_core.abort_requests(request_ids) + if len(request_ids) > 0: + self.engine_core.abort_requests(request_ids) def shutdown(self): self.engine_core.shutdown() @@ -218,7 +219,8 @@ def add_request(self, request: EngineCoreRequest) -> None: self._send_input(EngineCoreRequestType.ADD, request) def abort_requests(self, request_ids: List[str]) -> None: - self._send_input(EngineCoreRequestType.ABORT, request_ids) + if len(request_ids) > 0: + self._send_input(EngineCoreRequestType.ABORT, request_ids) def profile(self, is_start: bool = True) -> None: self._send_input(EngineCoreRequestType.PROFILE, diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py new file mode 100644 index 0000000000000..7f6c8036f7890 --- /dev/null +++ b/vllm/v1/engine/detokenizer.py @@ -0,0 +1,180 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.logger import init_logger +from vllm.sampling_params import RequestOutputKind +from vllm.transformers_utils.detokenizer_utils import ( + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest + + +logger = init_logger(__name__) + +@dataclass +class 
DetokenizerOutput: + output_text: str + token_ids: List[int] + finished: bool + finish_reason: Optional[str] = None + stop_reason: Union[int, str, None] = None + + +@dataclass +class Detokenizer: + + # Generation data + output_text: str + tokens: List[str] + token_ids: List[int] + # Length of the prompt (in token_ids) + prompt_len: int + + # Stop strings + stop: List[str] + include_stop_str_in_output: bool + + # Metadata for incremental detokenization + prefix_offset: int + read_offset: int + + # Parameters for detokenization + skip_special_tokens: bool + spaces_between_special_tokens: bool + output_kind: RequestOutputKind + + # Tokenizer for this request + tokenizer: AnyTokenizer + + # Accounting for stop string buffering + stop_buffer_length: int + _last_output_text_offset: int = 0 + + @property + def output_token_ids(self) -> List[int]: + return self.token_ids[self.prompt_len:] + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + ) -> "Detokenizer": + + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=request.prompt_token_ids, + skip_special_tokens=request.sampling_params.skip_special_tokens, + ) + + stops = request.sampling_params.stop + # Number of chars to hold back when stop strings are to be excluded + # from streamed output. + if stops and not request.sampling_params.include_stop_str_in_output: + stop_buffer_length = max(len(s) for s in stops) - 1 + else: + stop_buffer_length = 0 + + return cls( + output_text="", + tokens=tokens, + # Detokenizer mutates this list, so need a unique copy. + # NOTE(Nick): could we take ownership of it though? + token_ids=request.prompt_token_ids.copy(), + stop=stops, + include_stop_str_in_output=request.sampling_params. + include_stop_str_in_output, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=request.sampling_params.skip_special_tokens, + spaces_between_special_tokens=request.sampling_params. + spaces_between_special_tokens, + output_kind=request.sampling_params.output_kind, + prompt_len=len(request.prompt_token_ids), + tokenizer=tokenizer, + stop_buffer_length=stop_buffer_length, + ) + + def update_from_output( + self, + output: EngineCoreOutput, + ) -> Optional[DetokenizerOutput]: + """ + Update RequestState for the request_id by: + 1) Detokenize the new token ids incrementally. + 2) Update the RequestOutput with the new text. + """ + + new_token_ids = output.new_token_ids + finish_reason = output.finish_reason + stop_reason = output.stop_reason + + # 1) Detokenize the new token ids incrementally. + # TODO(woosuk): This method becomes very inefficient when the number of + # new_token_ids is more than 1. We need to optimize this. + decoded_text = "" + for new_token_id in output.new_token_ids: + self.token_ids.append(new_token_id) + (new_tokens, new_decoded_token_text, prefix_offset, + read_offset) = detokenize_incrementally( + tokenizer=self.tokenizer, + all_input_ids=self.token_ids, + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + + self.tokens.extend(new_tokens) + self.prefix_offset = prefix_offset + self.read_offset = read_offset + self.output_text += new_decoded_token_text + + decoded_text += new_decoded_token_text + + # 2) Evaluate stop criteria. 
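# How the stop-string check interacts with streaming, as a hedged sketch in
# plain Python (no vLLM types): a stop string may arrive split across decode
# steps, so up to len(stop) - 1 trailing characters are held back from the
# stream (the stop_buffer_length computed above) and the text is truncated
# at the stop string once it appears.
stop = "###"
hold_back = len(stop) - 1               # mirrors stop_buffer_length
text, sent = "", 0
for chunk in ["Hello #", "#", "# world"]:
    text += chunk
    idx = text.find(stop)
    if idx != -1:
        print(text[sent:idx])           # flush up to, not including, the stop
        break
    safe = max(len(text) - hold_back, sent)
    print(text[sent:safe], end="")      # stream only what cannot be a stop prefix
    sent = safe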
+ if self.stop: + stop = StopChecker.check_stop_strings( + output_text=self.output_text, + new_char_count=len(decoded_text), + stop=self.stop, + include_in_output=self.include_stop_str_in_output, + ) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + self.output_text = self.output_text[:truncate_to] + finish_reason = "stop" # TODO: use constant + stop_reason = stop_str + + # 3) Update the RequestOutput object with the new text. + finished = bool(finish_reason) + if self.output_kind == RequestOutputKind.FINAL_ONLY \ + and not finished: + return None + + delta = self.output_kind == RequestOutputKind.DELTA + output_text = self._get_next_output_text(finished, delta) + token_ids = new_token_ids if delta else self.output_token_ids + + return DetokenizerOutput( + output_text, token_ids, finished, finish_reason, stop_reason) + + + def _get_next_output_text(self, finished: bool, delta: bool) -> str: + """If delta is True, only new text since the last call to + this method is returned""" + + # We return the full output text if the sequence is finished. + buffer_length = 0 if finished else self.stop_buffer_length + if not delta: + return self.output_text[:-buffer_length] if buffer_length else ( + self.output_text) + length = len(self.output_text) - buffer_length + last_offset = self._last_output_text_offset + if last_offset < length: + self._last_output_text_offset = length + return self.output_text[last_offset:length] + return "" diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 27321752c3efa..5a6f75ea70f2a 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -15,11 +15,12 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import ( - BaseTokenizerGroup, init_tokenizer_from_configs) + AnyTokenizer, BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor +from vllm.v1.engine.request_state import RequestState from vllm.v1.executor.abstract import Executor logger = init_logger(__name__) @@ -45,27 +46,28 @@ def __init__( self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer = init_tokenizer_from_configs( + self.tokenizer_group = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) - self.tokenizer.ping() + self.tokenizer_group.ping() + + # Request States (map of request_id -> RequestState). + self.request_states: Dict[str, RequestState] = {} # Processor (convert Inputs --> EngineCoreRequests) self.processor = Processor(model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer, + tokenizer_group=self.tokenizer_group, input_registry=input_registry, mm_registry=mm_registry) - # Detokenizer (converts EngineCoreOutputs --> RequestOutput) + # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). 
self.output_processor = OutputProcessor( - tokenizer_name=vllm_config.model_config.tokenizer, - tokenizer_mode=vllm_config.model_config.tokenizer_mode, - trust_remote_code=vllm_config.model_config.trust_remote_code, - revision=vllm_config.model_config.tokenizer_revision, + request_states=self.request_states, + log_stats=False, ) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) @@ -103,21 +105,15 @@ def from_engine_args( multiprocess_mode=enable_multiprocessing) def get_num_unfinished_requests(self) -> int: - return self.output_processor.get_num_unfinished_requests() + return len(self.request_states) def has_unfinished_requests(self) -> bool: - return self.output_processor.has_unfinished_requests() + return self.get_num_unfinished_requests() > 0 @classmethod def validate_outputs(cls, outputs, output_type): return outputs - def abort_request(self, request_ids: List[str]) -> None: - """Remove request_ids from EngineCore and Detokenizer.""" - - self.engine_core.abort_requests(request_ids) - self.output_processor.abort_requests(request_ids) - def add_request( self, request_id: str, @@ -130,33 +126,39 @@ def add_request( priority: int = 0, ) -> None: - # 1) Process raw inputs into the request. + if request_id in self.request_states: + raise ValueError(f"Request id {request_id} already running.") + + # 1) Convert Input --> Request. request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Add the request to Detokenizer. - self.output_processor.add_request(request) + # 2) Make a new RequestState and queue. + self.request_states[request_id] = RequestState.from_new_request( + tokenizer=self.get_tokenizer(), + request=request, + ) # 3) Add the request to EngineCore. self.engine_core.add_request(request) def step(self) -> List[RequestOutput]: + """Pull From EngineCore -> Process -> Return RequestOutput.""" - # 1) Get EngineCoreOutput from the EngineCore. - outputs = self.engine_core.get_output() + # 1) Pull EngineCoreOutput from the EngineCore. + engine_core_outputs = self.engine_core.get_output() - # 2) Detokenizer the EngineCoreOutput. - request_outputs, requests_to_abort = self.detokenizer.step( - outputs.outputs) + # 2) Process EngineCoreOutputs. + processed_outputs = self.output_processor.process_outputs( + engine_core_outputs) - # 3) Abort requests that finished due to stopping criteria. - if requests_to_abort: - self.abort_request(requests_to_abort) + # 3) Abort any reqs that finished due to stop strings. 
+ self.engine_core.abort_requests(processed_outputs.reqs_to_abort) - return request_outputs + return processed_outputs.request_outputs def get_model_config(self): return self.model_config @@ -171,7 +173,7 @@ def get_tokenizer_group( self, group_type: Type[_G] = BaseTokenizerGroup, ) -> _G: - tokenizer_group = self.tokenizer + tokenizer_group = self.tokenizer_group if tokenizer_group is None: raise ValueError("Unable to get tokenizer because " @@ -182,3 +184,9 @@ def get_tokenizer_group( f"found type: {type(tokenizer_group)}") return tokenizer_group + + async def get_tokenizer( + self, + lora_request: Optional[LoRARequest] = None, + ) -> AnyTokenizer: + return self.get_tokenizer_group().get_lora_tokenizer(lora_request) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index e2fcf29cf4915..755e2790805e2 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -1,286 +1,124 @@ from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, List, Optional -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.sampling_params import RequestOutputKind -from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine import EngineCoreOutputs +from vllm.v1.engine.detokenizer import DetokenizerOutput +from vllm.v1.engine.request_state import RequestState from vllm.v1.metrics.stats import IterationStats -logger = init_logger(__name__) @dataclass class OutputProcessorOutput: - """Output from the OutputProcessor.""" + """Output of the OutputProcessor.step() function.""" request_outputs: List[RequestOutput] - requests_to_abort: List[str] + reqs_to_abort: List[str] iteration_stats: IterationStats -@dataclass -class IncrementalDetokenizer: - - # Generation data - output_text: str - tokens: List[str] - token_ids: List[int] - - # Stop strings - stop: List[str] - include_stop_str_in_output: bool - - # Metadata for incremental detokenization - prefix_offset: int - read_offset: int - - # Parameters for detokenization - skip_special_tokens: bool - spaces_between_special_tokens: bool - output_kind: RequestOutputKind - - # TODO: Probably decouple these - request_id: str - prompt: Optional[str] - prompt_token_ids: List[int] - - # Tokenizer for this request - tokenizer: AnyTokenizer - - # Accounting for stop string buffering - stop_buffer_length: int - _last_output_text_offset: int = 0 - - @property - def output_token_ids(self) -> List[int]: - assert len(self.token_ids) >= len(self.prompt_token_ids) - return self.token_ids[len(self.prompt_token_ids):] - - @classmethod - def from_new_request( - cls, - tokenizer: AnyTokenizer, - request: EngineCoreRequest, - ) -> "IncrementalDetokenizer": - - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=tokenizer, - prompt_ids=request.prompt_token_ids, - skip_special_tokens=request.sampling_params.skip_special_tokens, - ) - stops = request.sampling_params.stop - # Number of chars to hold back when stop strings are to be excluded - # from streamed output. 
- if stops and not request.sampling_params.include_stop_str_in_output: - stop_buffer_length = max(len(s) for s in stops) - 1 - else: - stop_buffer_length = 0 +class OutputProcessor: - return cls( - output_text="", - tokens=tokens, - # Detokenizer mutates this list, so need a unique copy. - # NOTE(Nick): could we take ownership of it though? - token_ids=request.prompt_token_ids.copy(), - stop=stops, - include_stop_str_in_output=request.sampling_params. - include_stop_str_in_output, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=request.sampling_params.skip_special_tokens, - spaces_between_special_tokens=request.sampling_params. - spaces_between_special_tokens, - output_kind=request.sampling_params.output_kind, - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, - tokenizer=tokenizer, - stop_buffer_length=stop_buffer_length, - ) + def __init__( + self, + request_states: Dict[str, RequestState], + log_stats: bool, + ): + self.request_states = request_states + self.log_stats = log_stats - def add_tokens( + def make_request_output( self, - new_token_ids: List[int], - finish_reason: Optional[str], - stop_reason: Optional[Union[int, str, None]], + request_state: RequestState, + detokenizer_output: Optional[DetokenizerOutput], ) -> Optional[RequestOutput]: - """ - Update RequestState for the request_id by: - 1) Detokenize the new token ids incrementally. - 2) Update the RequestOutput with the new text. - """ - - # 1) Detokenize the new token ids incrementally. - # TODO(woosuk): This method becomes very inefficient when the number of - # new_token_ids is more than 1. We need to optimize this. - decoded_text = "" - for new_token_id in new_token_ids: - self.token_ids.append(new_token_id) - (new_tokens, new_decoded_token_text, prefix_offset, - read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=self.token_ids, - prev_tokens=self.tokens, - prefix_offset=self.prefix_offset, - read_offset=self.read_offset, - skip_special_tokens=self.skip_special_tokens, - spaces_between_special_tokens=self. - spaces_between_special_tokens, - ) - - self.tokens.extend(new_tokens) - self.prefix_offset = prefix_offset - self.read_offset = read_offset - self.output_text += new_decoded_token_text - - decoded_text += new_decoded_token_text - - # 2) Evaluate stop criteria. - if self.stop: - stop = StopChecker.check_stop_strings( - output_text=self.output_text, - new_char_count=len(decoded_text), - stop=self.stop, - include_in_output=self.include_stop_str_in_output, - ) - if stop is not None: - stop_str, truncate_to = stop - if truncate_to != -1: - self.output_text = self.output_text[:truncate_to] - finish_reason = "stop" # TODO: use constant - stop_reason = stop_str - - # TODO: handle stop_token_ids here too? - # 3) Update the RequestOutput object with the new text. 
- finished = bool(finish_reason) - if self.output_kind == RequestOutputKind.FINAL_ONLY \ - and not finished: + if detokenizer_output is None: return None - delta = self.output_kind == RequestOutputKind.DELTA - output_text = self._get_next_output_text(finished, delta) - token_ids = new_token_ids if delta else self.output_token_ids - request_output = RequestOutput.new( - self.request_id, - self.prompt, - self.prompt_token_ids, - output_text, - token_ids, - finished, + request_state.request_id, + request_state.prompt, + request_state.prompt_token_ids, + detokenizer_output.output_text, + detokenizer_output.token_ids, + detokenizer_output.finished, ) - - if finished: + if detokenizer_output.finished: completion_output = request_output.outputs[0] - completion_output.finish_reason = finish_reason - completion_output.stop_reason = stop_reason + completion_output.finish_reason = detokenizer_output.finish_reason + completion_output.stop_reason = detokenizer_output.stop_reason return request_output - def _get_next_output_text(self, finished: bool, delta: bool) -> str: - """If delta is True, only new text since the last call to - this method is returned""" - - # We return the full output text if the sequence is finished. - buffer_length = 0 if finished else self.stop_buffer_length - if not delta: - return self.output_text[:-buffer_length] if buffer_length else ( - self.output_text) - length = len(self.output_text) - buffer_length - last_offset = self._last_output_text_offset - if last_offset < length: - self._last_output_text_offset = length - return self.output_text[last_offset:length] - return "" - - -class OutputProcessor: - - def __init__(self, - tokenizer_name: str, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - revision: Optional[str] = None): - # TODO: once we support LoRA, we should should pass the tokenizer - # here. We currently have two copies (this + in the LLMEngine). - self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, - tokenizer_mode=tokenizer_mode, - trust_remote_code=trust_remote_code, - revision=revision) - - # Request id -> IncrementalDetokenizer - self.request_states: Dict[str, IncrementalDetokenizer] = {} - - def is_request_active(self, request_id: str): - return request_id in self.request_states - - def get_num_unfinished_requests(self): - return len(self.request_states) - - def has_unfinished_requests(self) -> bool: - return len(self.request_states) > 0 - - def abort_requests( - self, - request_ids: Iterable[str], - ) -> None: - """Remove the request_ids from the Detokenizer.""" - - for request_id in request_ids: - self.request_states.pop(request_id, None) - - def add_request( - self, - request: EngineCoreRequest, - ): - """Add new request to the Detokenizer.""" - - assert (request.request_id not in self.request_states) - - request_state = IncrementalDetokenizer.from_new_request( - self.tokenizer, request) - self.request_states[request.request_id] = request_state - - def step( - self, encore_core_outputs: List[EngineCoreOutput] - ) -> OutputProcessorOutput: - """Update state and request the RequestOutputs to the LLMEngine.""" + def process_outputs(self, + outputs: EngineCoreOutputs) -> OutputProcessorOutput: + """ + Process the EngineCoreOutputs: + 1) Compute stats for logging + 2) Detokenize + 3) Create and handle RequestOutput objects: + * If self.stream_outputs (for usage with AsyncLLM), + we put RequestOutput objects into the asyncio queue + for handling by the per-request generate() tasks. 
+ * If not self.stream_outputs (for usage with LLMEngine), + we return a list of RequestOutput objects. + + ****************** NOTE FOR DEVELOPERS ****************** + + VLLM V1 minimizes the number of python loops over the full + batch to ensure system overheads are minimized. This is the + only function that should loop over EngineCoreOutputs. + + If you need to touch every element of the batch, implement a + method called XXXClass.update_from_output() to be called + within the loop below. For examples, see: + * IterationStats.update_from_output() + * Detokenizer.update_from_output() + + ********************************************************** + """ - iteraton_stats = IterationStats() request_outputs: List[RequestOutput] = [] - requests_to_abort: List[str] = [] - for engine_core_output in encore_core_outputs: - request_id = engine_core_output.request_id - detokenizer = self.request_states.get(request_id) - if detokenizer is None: + reqs_to_abort: List[str] = [] + iteration_stats = IterationStats(self.log_stats) + for engine_core_output in outputs.outputs: + req_id = engine_core_output.request_id + req_state = self.request_states.get(req_id) + if req_state is None: # Ignore output for already-aborted request. continue - # Detokenize and update state. - request_output = detokenizer.add_tokens( - new_token_ids=engine_core_output.new_token_ids, - finish_reason=engine_core_output.finish_reason, - stop_reason=engine_core_output.stop_reason, - ) - - if request_output is not None: - # Add to RequestOutputs list. - request_outputs.append(request_output) + # 1) Compute stats for this iteration. + iteration_stats.update_from_output(engine_core_output, + req_state.is_prefilling, + req_state.prompt_len) + req_state.is_prefilling = False + + # 2) Detokenize the token ids into text. + detokenizer_output = req_state.detokenizer.update_from_output( + engine_core_output) + + # 3) Create and handle RequestOutput objects. + if request_output := self.make_request_output( + req_state, detokenizer_output): + if req_state.queue is not None: + # AsyncLLM: put into queue for handling by generate(). + req_state.queue.put_nowait(request_output) + else: + # LLMEngine: return list of RequestOutputs. + request_outputs.append(request_output) # Free completed requests. if request_output.finished: - self.request_states.pop(request_id) + self.request_states.pop(req_id) if not engine_core_output.finished: - requests_to_abort.append(request_id) + # If req not finished in EngineCore, but Detokenizer + # detected stop string, abort needed in EngineCore. + reqs_to_abort.append(req_id) - # Return to EngineClient. 
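# The NOTE above asks per-feature bookkeeping to live in an
# XXXClass.update_from_output() method so that this remains the only
# per-output Python loop. A hedged, stand-alone sketch of that convention
# with toy types (Output and TokenCounter are illustrative, not vLLM classes):
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class Output:
    request_id: str
    new_token_ids: List[int]

@dataclass
class TokenCounter:
    counts: Dict[str, int] = field(default_factory=dict)

    def update_from_output(self, output: Output) -> None:
        self.counts[output.request_id] = (
            self.counts.get(output.request_id, 0) + len(output.new_token_ids))

def process(outputs: List[Output], counter: TokenCounter) -> None:
    for out in outputs:                  # the single loop over the batch
        counter.update_from_output(out)

counter = TokenCounter()
process([Output("a", [1, 2]), Output("b", [3])], counter)
assert counter.counts == {"a": 2, "b": 1}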
return OutputProcessorOutput( request_outputs=request_outputs, - requests_to_abort=requests_to_abort, - iteration_stats=iteraton_stats, + reqs_to_abort=reqs_to_abort, + iteration_stats=iteration_stats, ) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 43419d2ff5381..cc5980a6b96b5 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -25,19 +25,18 @@ def __init__( model_config: ModelConfig, cache_config: CacheConfig, lora_config: Optional[LoRAConfig], - tokenizer: BaseTokenizerGroup, + tokenizer_group: BaseTokenizerGroup, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): self.model_config = model_config self.lora_config = lora_config - self.tokenizer = tokenizer self.generation_config_fields = model_config.try_get_generation_config( ) self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer, + tokenizer_group, mm_registry) self.input_processor = input_registry.create_input_processor( model_config) diff --git a/vllm/v1/engine/request_state.py b/vllm/v1/engine/request_state.py new file mode 100644 index 0000000000000..ed808ac456624 --- /dev/null +++ b/vllm/v1/engine/request_state.py @@ -0,0 +1,46 @@ +from typing import List, Optional + +import asyncio + +from vllm.outputs import RequestOutput +from vllm.transformers_utils.detokenizer_utils import AnyTokenizer +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.detokenizer import Detokenizer + + +class RequestState: + """RequestState for AsyncLLM and LLMEngine.""" + + def __init__( + self, + request_id: str, + prompt: Optional[str], + prompt_token_ids: List[int], + detokenizer: Detokenizer, + queue: Optional[asyncio.Queue[RequestOutput]], + ): + self.request_id = request_id + self.prompt = prompt + self.prompt_token_ids = prompt_token_ids + self.prompt_len = len(prompt_token_ids) + self.detokenizer = detokenizer + self.is_prefilling = True + self.queue = queue + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + queue: Optional[asyncio.Queue[RequestOutput]] = None, + ) -> "RequestState": + return cls( + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + detokenizer=Detokenizer.from_new_request( + tokenizer=tokenizer, + request=request, + ), + queue=queue, + ) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index d9c7aa400005a..383aa200944eb 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,5 +1,7 @@ from dataclasses import dataclass +from vllm.v1.engine import EngineCoreOutput + @dataclass class SchedulerStats: @@ -12,9 +14,23 @@ class SchedulerStats: # gpu_prefix_cache_hit_rate: float = 0.0 -@dataclass class IterationStats: - """Stats associated with a single iteration""" + """Stats associated with a single set of EngineCoreOutputs.""" + + def __init__(self, log_stats: bool): + self.log_stats = log_stats + self.num_generation_tokens = 0 + self.num_prompt_tokens = 0 + + def update_from_output(self, + output: EngineCoreOutput, + is_prefilling: bool, + prompt_len: int = 0): + """Update the IterationStats with the EngineCoreOutput.""" + + if not self.log_stats: + return - num_generation_tokens: int = 0 - num_prompt_tokens: int = 0 + self.num_generation_tokens += len(output.new_token_ids) + if is_prefilling: + self.num_prompt_tokens += prompt_len From 86d33a16717f3f2261e1d947ed33f1e5568c8692 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 
2025 18:58:11 +0000 Subject: [PATCH 13/67] stash --- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/metrics/stats.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d7068d1262843..5cf8b3a62eb91 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -151,7 +151,7 @@ async def add_request( # 2) Make a nnew RequestState and queue. queue: asyncio.Queue[RequestOutput] = asyncio.Queue() self.request_states[request_id] = RequestState.from_new_request( - tokenizer=self.get_tokenizer(), + tokenizer=(await self.get_tokenizer()), request=request, queue=queue, ) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 383aa200944eb..5deb50f4795bb 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,6 +1,8 @@ +from typing import TYPE_CHECKING from dataclasses import dataclass -from vllm.v1.engine import EngineCoreOutput +if TYPE_CHECKING: + from vllm.v1.engine import EngineCoreOutput @dataclass @@ -23,7 +25,7 @@ def __init__(self, log_stats: bool): self.num_prompt_tokens = 0 def update_from_output(self, - output: EngineCoreOutput, + output: "EngineCoreOutput", is_prefilling: bool, prompt_len: int = 0): """Update the IterationStats with the EngineCoreOutput.""" From 4066fc84a4c751d6d7ff4a2455399e12fb3fd53a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 19:07:45 +0000 Subject: [PATCH 14/67] working again --- tests/v1/engine/test_engine_core.py | 4 ++-- tests/v1/engine/test_engine_core_client.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 5b1732036e807..cccfd305ac604 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -80,7 +80,7 @@ def test_engine_core(monkeypatch): assert len(engine_core.scheduler.running) == 4 # Loop through until they are all done. - while len(engine_core.step()) > 0: + while len(engine_core.step().outputs) > 0: pass assert len(engine_core.scheduler.waiting) == 0 @@ -170,7 +170,7 @@ def test_engine_core_advanced_sampling(monkeypatch): assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 # Loop through until they are all done. 
- while len(engine_core.step()) > 0: + while len(engine_core.step().outputs) > 0: pass assert len(engine_core.scheduler.waiting) == 0 diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 7eac16f2cf542..e2c728b22d481 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -43,7 +43,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: def loop_until_done(client: EngineCoreClient, outputs: Dict): while True: - engine_core_outputs = client.get_output() + engine_core_outputs = client.get_output().outputs if len(engine_core_outputs) == 0: break @@ -61,7 +61,7 @@ def loop_until_done(client: EngineCoreClient, outputs: Dict): async def loop_until_done_async(client: EngineCoreClient, outputs: Dict): while True: - engine_core_outputs = await client.get_output_async() + engine_core_outputs = await client.get_output_async().outputs if len(engine_core_outputs) == 0: break From c9ffc60d2e051eecc13f0ab639ee64cddc316991 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 19:15:37 +0000 Subject: [PATCH 15/67] fixed sorting --- vllm/v1/engine/detokenizer.py | 1 - vllm/v1/engine/request_state.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 7f6c8036f7890..c93795974e347 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -8,7 +8,6 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest - logger = init_logger(__name__) @dataclass diff --git a/vllm/v1/engine/request_state.py b/vllm/v1/engine/request_state.py index ed808ac456624..572b6d977f350 100644 --- a/vllm/v1/engine/request_state.py +++ b/vllm/v1/engine/request_state.py @@ -1,6 +1,5 @@ -from typing import List, Optional - import asyncio +from typing import List, Optional from vllm.outputs import RequestOutput from vllm.transformers_utils.detokenizer_utils import AnyTokenizer From e34b9dc2e0910472f3c46153efd8b81104d0c879 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:14:08 +0000 Subject: [PATCH 16/67] merged --- tests/v1/engine/test_async_llm.py | 61 +++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index fffb5b8100ec7..a9fa8152ed87d 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -13,21 +13,38 @@ allow_module_level=True) ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", + enforce_eager=True, disable_log_requests=True) -async def generate(engine: AsyncLLM, request_id: str, - max_tokens: int) -> Tuple[int, str]: - count = 0 - async for _ in engine.generate(request_id=request_id, - prompt="Hello my name is Robert and", - sampling_params=SamplingParams( - max_tokens=max_tokens, temperature=0)): - - count += 1 - await asyncio.sleep(0.) 
+async def run_example( + engine: AsyncLLM, + request_id: str, + num_tokens: int, + abort_after: int = 0 +) -> Tuple[int, int, str]: + + generator = engine.generate( + request_id=request_id, + prompt="Hello my name is Robert and", + sampling_params=SamplingParams(max_tokens=num_tokens, temperature=0)) - return count, request_id + count = 0 + try: + async for _ in generator(): + count += 1 + print(f"{request_id=}, {count=}, {abort_after=}") + if count == abort_after: + # Simulate request cancellation. + print(f"{request_id=}") + asyncio.current_task().cancel() + except asyncio.CancelledError: + print(f"{request_id=}") + assert request_id not in engine.request_states + finally: + + expected_count = num_tokens if abort_after == 0 else abort_after + return count, expected_count, request_id @pytest.mark.asyncio @@ -40,24 +57,30 @@ async def test_load(monkeypatch): engine = AsyncLLM.from_engine_args(ENGINE_ARGS) - NUM_REQUESTS = 10000 + NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 10 + # Abort 1/100 requests after 5 tokens. + ABORT_RATE = 100 + ABORT_AFTER = 5 request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] # Create concurrent requests. - tasks = [] - for request_id in request_ids: - tasks.append( - asyncio.create_task( - generate(engine, request_id, NUM_EXPECTED_TOKENS))) + tasks = [ + asyncio.create_task(run_example( + engine=engine, + request_id=request_id, + num_tokens=NUM_EXPECTED_TOKENS, + abort_after=(ABORT_AFTER if idx % ABORT_RATE == 0 else 0) + )) for idx, request_id in enumerate(request_ids) + ] # Confirm that we got all the EXPECTED tokens from the requests. failed_request_id = None tokens = None for task in tasks: - num_generated_tokens, request_id = await task - if (num_generated_tokens != NUM_EXPECTED_TOKENS + num_generated_tokens, expected_tokens, request_id = await task + if (num_generated_tokens != expected_tokens and failed_request_id is None): failed_request_id = request_id tokens = num_generated_tokens From dd6e3d60fed08dc19b547dfd0cf3a1574ceb6b84 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:18:07 +0000 Subject: [PATCH 17/67] reduce number of changes --- vllm/v1/engine/processor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index cc5980a6b96b5..43419d2ff5381 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -25,18 +25,19 @@ def __init__( model_config: ModelConfig, cache_config: CacheConfig, lora_config: Optional[LoRAConfig], - tokenizer_group: BaseTokenizerGroup, + tokenizer: BaseTokenizerGroup, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): self.model_config = model_config self.lora_config = lora_config + self.tokenizer = tokenizer self.generation_config_fields = model_config.try_get_generation_config( ) self.input_preprocessor = InputPreprocessor(model_config, - tokenizer_group, + self.tokenizer, mm_registry) self.input_processor = input_registry.create_input_processor( model_config) From dbd86b8591b4ea7d468c5b6ebe5f2383d9f33f38 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:18:42 +0000 Subject: [PATCH 18/67] reduce changes --- vllm/v1/metrics/loggers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 4ed9193128647..6b2fd02b8aed2 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -17,6 +17,7 @@ def log(self, scheduler_stats: 
SchedulerStats): class LoggingStatLogger(StatLoggerBase): + """Log Stats to standard output.""" def __init__(self): self.last_log_time = time.monotonic() From ebf3530c82736d5d6f570614a9345bf78ddd0549 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:19:31 +0000 Subject: [PATCH 19/67] reduce changes --- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/llm_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a78c67a70bbc9..da0bd252b5263 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -72,7 +72,7 @@ def __init__( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer_group=self.tokenizer_group, + tokenizer=self.tokenizer_group, input_registry=input_registry, ) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 5a6f75ea70f2a..24094b43a984d 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -60,7 +60,7 @@ def __init__( self.processor = Processor(model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer_group=self.tokenizer_group, + tokenizer=self.tokenizer_group, input_registry=input_registry, mm_registry=mm_registry) From 7b6d9b3cbe3987899edfaa6f1de02564d1e0fc0e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:20:18 +0000 Subject: [PATCH 20/67] updared --- vllm/v1/metrics/loggers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 6b2fd02b8aed2..8feeef17542e6 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -17,12 +17,12 @@ def log(self, scheduler_stats: SchedulerStats): class LoggingStatLogger(StatLoggerBase): - """Log Stats to standard output.""" def __init__(self): self.last_log_time = time.monotonic() def log(self, scheduler_stats: SchedulerStats): + """Log Stats to standard output.""" # Log every _LOCAL_LOGGING_INTERVAL_SEC. now = time.monotonic() From 707796f11c865214855681dca0f2531e86cd8598 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:22:59 +0000 Subject: [PATCH 21/67] make pr more reviewable --- vllm/v1/engine/llm_engine.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 24094b43a984d..4c272b1b483ab 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -46,12 +46,12 @@ def __init__( self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer_group = init_tokenizer_from_configs( + self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) - self.tokenizer_group.ping() + self.tokenizer.ping() # Request States (map of request_id -> RequestState). 
self.request_states: Dict[str, RequestState] = {} @@ -60,7 +60,7 @@ def __init__( self.processor = Processor(model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer_group, + tokenizer=self.tokenizer, input_registry=input_registry, mm_registry=mm_registry) @@ -138,7 +138,7 @@ def add_request( # 2) Make a new RequestState and queue. self.request_states[request_id] = RequestState.from_new_request( - tokenizer=self.get_tokenizer(), + tokenizer=self.get_tokenizer_group().get_lora_tokenizer(lora_request), request=request, ) @@ -173,7 +173,7 @@ def get_tokenizer_group( self, group_type: Type[_G] = BaseTokenizerGroup, ) -> _G: - tokenizer_group = self.tokenizer_group + tokenizer_group = self.tokenizer if tokenizer_group is None: raise ValueError("Unable to get tokenizer because " @@ -184,9 +184,3 @@ def get_tokenizer_group( f"found type: {type(tokenizer_group)}") return tokenizer_group - - async def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return self.get_tokenizer_group().get_lora_tokenizer(lora_request) From df72c8f320618156aa8c716dc40ebbc1a0017368 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:24:51 +0000 Subject: [PATCH 22/67] update comments --- vllm/v1/engine/output_processor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 755e2790805e2..5725a19b8dbc9 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -58,11 +58,12 @@ def process_outputs(self, 1) Compute stats for logging 2) Detokenize 3) Create and handle RequestOutput objects: - * If self.stream_outputs (for usage with AsyncLLM), - we put RequestOutput objects into the asyncio queue - for handling by the per-request generate() tasks. - * If not self.stream_outputs (for usage with LLMEngine), - we return a list of RequestOutput objects. + * If there is a queue (for usage with AsyncLLM), + put the RequestOutput objects into the queue for + handling by the per-request generate() tasks. + + * If there is no queue (for usage with LLMEngine), + return a list of RequestOutput objects. ****************** NOTE FOR DEVELOPERS ****************** From 9d67efc40e45322cef9d9bd8d4a3fd001e689c25 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:27:27 +0000 Subject: [PATCH 23/67] make PR more readable --- vllm/v1/engine/async_llm.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index da0bd252b5263..7ca22f6c6c7a0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -57,12 +57,12 @@ def __init__( self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). - self.tokenizer_group = init_tokenizer_from_configs( + self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) - self.tokenizer_group.ping() + self.tokenizer.ping() # Request States (map of request_id -> RequestState). 
self.request_states: Dict[str, RequestState] = {} @@ -72,7 +72,7 @@ def __init__( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, - tokenizer=self.tokenizer_group, + tokenizer=self.tokenizer, input_registry=input_registry, ) @@ -155,7 +155,7 @@ async def add_request( # 2) Make a nnew RequestState and queue. queue: asyncio.Queue[RequestOutput] = asyncio.Queue() self.request_states[request_id] = RequestState.from_new_request( - tokenizer=(await self.get_tokenizer()), + tokenizer=(await self.get_tokenizer(lora_request)), request=request, queue=queue, ) @@ -265,14 +265,13 @@ async def step_async(self): kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: - """Abort a Request.""" + """Abort RequestId in AsyncLLM and EngineCore.""" - # Remove from EngineCore. - await self.engine_core.abort_requests_async([request_id]) + request_ids = [request_id] + await self.engine_core.abort_requests_async(request_ids) - # Remove from AsyncLLM. - # Note: the request can finish during await, so check to make - # sure it is still active in the tracker before we pop. + # If a request finishes while we await then the request_id + # will be removed from the tracked queues before we get here. _ = self.request_states.pop(request_id, None) def _log_stats( @@ -310,7 +309,7 @@ async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, ) -> AnyTokenizer: - return self.tokenizer_group.get_lora_tokenizer(lora_request) + return self.tokenizer.get_lora_tokenizer(lora_request) async def is_tracing_enabled(self) -> bool: return False From 1cae7836148bea3c35641675ae3d2dc435167e58 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:27:57 +0000 Subject: [PATCH 24/67] reduce cruft --- vllm/v1/engine/async_llm.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7ca22f6c6c7a0..27663562ea990 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -50,10 +50,6 @@ def __init__( LoggingStatLogger(), # TODO(rob): PrometheusStatLogger(), ] - self.stat_loggers: List[StatLoggerBase] = [ - LoggingStatLogger(), - # TODO(rob): PrometheusStatLogger(), - ] self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). From 6401cfa0dbcb2998c662537f78a07134e6efbbb0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:28:29 +0000 Subject: [PATCH 25/67] reduce changes --- vllm/v1/engine/async_llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 27663562ea990..6de28bc6d3fbf 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -63,7 +63,7 @@ def __init__( # Request States (map of request_id -> RequestState). self.request_states: Dict[str, RequestState] = {} - # Processor (convert Inputs --> EngineCoreRequests). + # Processor (converts Inputs --> EngineCoreRequests). self.processor = Processor( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, @@ -72,7 +72,7 @@ def __init__( input_registry=input_registry, ) - # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). + # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). 
self.output_processor = OutputProcessor( request_states=self.request_states, log_stats=self.log_stats, From 33bc01d02f35265cf80f93c89dc0d6f6d7b01993 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:29:14 +0000 Subject: [PATCH 26/67] reduce changes --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6de28bc6d3fbf..d06455a91df88 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -148,7 +148,7 @@ async def add_request( prompt_adapter_request, priority) - # 2) Make a nnew RequestState and queue. + # 2) Add the request to AsyncLLM. queue: asyncio.Queue[RequestOutput] = asyncio.Queue() self.request_states[request_id] = RequestState.from_new_request( tokenizer=(await self.get_tokenizer(lora_request)), From 7dda30541ee657358eaba52fc40846e610821f87 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:30:00 +0000 Subject: [PATCH 27/67] updated --- vllm/v1/engine/detokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index c93795974e347..728c306228f87 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -26,7 +26,6 @@ class Detokenizer: output_text: str tokens: List[str] token_ids: List[int] - # Length of the prompt (in token_ids) prompt_len: int # Stop strings From 769cff54e1bb1efccc4c3dd83863c8d822054f0c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:30:53 +0000 Subject: [PATCH 28/67] reduce changes --- vllm/v1/engine/detokenizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 728c306228f87..8d9ca1c805ecc 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -111,7 +111,7 @@ def update_from_output( # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. decoded_text = "" - for new_token_id in output.new_token_ids: + for new_token_id in new_token_ids: self.token_ids.append(new_token_id) (new_tokens, new_decoded_token_text, prefix_offset, read_offset) = detokenize_incrementally( @@ -147,6 +147,8 @@ def update_from_output( finish_reason = "stop" # TODO: use constant stop_reason = stop_str + # TODO: handle stop_token_ids here too? + # 3) Update the RequestOutput object with the new text. 
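# What the two output kinds handled below mean, as a hedged sketch with plain
# data (no vLLM types): DELTA streams only the text/tokens produced since the
# previous step, while FINAL_ONLY suppresses intermediate outputs and emits a
# single cumulative result when the request finishes.
steps = [("Hel", False), ("lo", False), (" world", True)]

# DELTA: one output per step, each carrying only the new text.
delta_outputs = [text for text, _ in steps]
assert delta_outputs == ["Hel", "lo", " world"]

# FINAL_ONLY: nothing until finished, then the full accumulated text.
final_outputs = ["".join(t for t, _ in steps[:i + 1])
                 for i, (_, finished) in enumerate(steps) if finished]
assert final_outputs == ["Hello world"]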
finished = bool(finish_reason) if self.output_kind == RequestOutputKind.FINAL_ONLY \ From b1b4c47c250f4975117212e8f91557689cfd3ae7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:35:36 +0000 Subject: [PATCH 29/67] minor cleanups --- vllm/v1/engine/output_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 5725a19b8dbc9..7cedee4d1c317 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -10,7 +10,6 @@ @dataclass class OutputProcessorOutput: - """Output of the OutputProcessor.step() function.""" request_outputs: List[RequestOutput] reqs_to_abort: List[str] From 2f916d14554a6af9ae87ecc0461d376c5809f4fe Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:36:48 +0000 Subject: [PATCH 30/67] clean up --- vllm/v1/metrics/stats.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 5deb50f4795bb..b1b9445d70aec 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -24,10 +24,12 @@ def __init__(self, log_stats: bool): self.num_generation_tokens = 0 self.num_prompt_tokens = 0 - def update_from_output(self, - output: "EngineCoreOutput", - is_prefilling: bool, - prompt_len: int = 0): + def update_from_output( + self, + output: "EngineCoreOutput", + is_prefilling: bool, + prompt_len: int + ): """Update the IterationStats with the EngineCoreOutput.""" if not self.log_stats: From 6a5f245312aa55ef40b0f2873e0406de8820ad77 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:37:32 +0000 Subject: [PATCH 31/67] updated --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d06455a91df88..04b050dda03fb 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -151,7 +151,7 @@ async def add_request( # 2) Add the request to AsyncLLM. queue: asyncio.Queue[RequestOutput] = asyncio.Queue() self.request_states[request_id] = RequestState.from_new_request( - tokenizer=(await self.get_tokenizer(lora_request)), + tokenizer=self.tokenizer.get_lora_tokenizer(lora_request), request=request, queue=queue, ) From 9ea36c89531b53fceec915dd9b2675773dd10d55 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:38:43 +0000 Subject: [PATCH 32/67] updated --- vllm/v1/engine/async_llm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 04b050dda03fb..edafedd0159c2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -199,7 +199,8 @@ async def generate( # we can call __init__ before the event loop, which enables us # to handle startup failure gracefully in the OpenAI server. 
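# The comment above motivates deferring the background output handler until the
# first generate() call, because __init__ may run before any event loop exists.
# Below is a minimal, self-contained asyncio sketch of that lazy-start pattern;
# the class and method names are illustrative only and are not the AsyncLLM API.
import asyncio
from typing import Optional

class LazyHandlerSketch:
    """Illustrative only: defer background-task creation to first use."""

    def __init__(self) -> None:
        # No event loop is needed here, so this can run in sync startup code.
        self._handler_task: Optional[asyncio.Task] = None
        self._queue: Optional[asyncio.Queue] = None

    async def _run_handler(self) -> None:
        assert self._queue is not None
        while True:
            item = await self._queue.get()
            print("handled:", item)

    async def generate(self, item: str) -> None:
        # First call: we are now inside the event loop, so it is safe to
        # create the queue and spawn the long-running handler task.
        if self._handler_task is None:
            self._queue = asyncio.Queue()
            self._handler_task = asyncio.create_task(self._run_handler())
        await self._queue.put(item)

# Example usage: asyncio.run(LazyHandlerSketch().generate("hello"))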
if self.output_handler is None: - self.output_handler = asyncio.create_task(self.step_async()) + self.output_handler = asyncio.create_task( + self._run_output_handler()) q = await self.add_request( request_id, @@ -231,7 +232,7 @@ async def generate( await self.abort(request_id) raise - async def step_async(self): + async def _run_output_handler(self): """Busy loop: Pull From EngineCore -> Process -> Push to Queues""" try: From 318c20399643c1981e4dd7b36ef0701d209fb4dc Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:39:14 +0000 Subject: [PATCH 33/67] reduce changes --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index edafedd0159c2..9c0324d6ff5cc 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -233,7 +233,7 @@ async def generate( raise async def _run_output_handler(self): - """Busy loop: Pull From EngineCore -> Process -> Push to Queues""" + """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" try: while True: From 374618326080c5dbd2a9f9fa4e5ee97e56006890 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:39:36 +0000 Subject: [PATCH 34/67] reduce LOC changes --- vllm/v1/engine/async_llm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9c0324d6ff5cc..d400fe5856a6a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -226,8 +226,9 @@ async def generate( yield out - # If the request is disconnected by the client, the generate() - # task will be canceled. So, we abort the request if we end up here. + # If the request is disconnected by the client, the + # generate() task will be canceled. So, we abort the + # request if we end up here. except asyncio.CancelledError: await self.abort(request_id) raise From 449405b2af39c06679732709429d9f1d3a51f94e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:53:48 +0000 Subject: [PATCH 35/67] updated --- vllm/v1/engine/async_llm.py | 25 ++++------ vllm/v1/engine/output_processor.py | 77 +++++++++++++++++++++++++----- vllm/v1/engine/request_state.py | 45 ----------------- 3 files changed, 76 insertions(+), 71 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d400fe5856a6a..732056ad4342b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -75,6 +75,7 @@ def __init__( # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). self.output_processor = OutputProcessor( request_states=self.request_states, + tokenizer=self.tokenizer, log_stats=self.log_stats, ) @@ -138,25 +139,20 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - if request_id in self.request_states: - raise ValueError(f"Request id {request_id} already running.") + # 1) Create a new output queue for the request. + queue: asyncio.Queue[RequestOutput] = asyncio.Queue() - # 1) Convert Input --> Request. + # 2) Convert Input --> Request. request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, prompt_adapter_request, priority) - # 2) Add the request to AsyncLLM. 
- queue: asyncio.Queue[RequestOutput] = asyncio.Queue() - self.request_states[request_id] = RequestState.from_new_request( - tokenizer=self.tokenizer.get_lora_tokenizer(lora_request), - request=request, - queue=queue, - ) + # 3) Add the request to OutputProcessor (this process). + self.output_processor.add_request(request, queue) - # 3) Add the EngineCoreRequest to EngineCore (separate process). + # 4) Add the EngineCoreRequest to EngineCore (separate process). await self.engine_core.add_request_async(request) if self.log_requests: @@ -219,7 +215,7 @@ async def generate( # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - # Note: OutputProcessor removes from request_states. + # Note: OutputProcessor handles removal from request_states. if out.finished: yield out break @@ -241,10 +237,9 @@ async def _run_output_handler(self): # 1) Pull EngineCoreOutputs from the EngineCore. engine_core_outputs = await self.engine_core.get_output_async() - # 2) Process EngineCoreOutputs, pushing RequestOutputs into - # asyncio queues for handling by the per-req generate() task. + # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( - engine_core_outputs) + engine_core_outputs, self.request_states) # 3) Abort any reqs that finished due to stop strings. await self.engine_core.abort_requests_async( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 7cedee4d1c317..fe398cdf66e8d 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -1,10 +1,12 @@ +import asyncio from dataclasses import dataclass from typing import Dict, List, Optional +from vllm.transformers_utils.detokenizer_utils import AnyTokenizer +from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.outputs import RequestOutput -from vllm.v1.engine import EngineCoreOutputs -from vllm.v1.engine.detokenizer import DetokenizerOutput -from vllm.v1.engine.request_state import RequestState +from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequest +from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerOutput from vllm.v1.metrics.stats import IterationStats @@ -16,15 +18,65 @@ class OutputProcessorOutput: iteration_stats: IterationStats -class OutputProcessor: +class RequestState: def __init__( self, - request_states: Dict[str, RequestState], - log_stats: bool, + request_id: str, + prompt: Optional[str], + prompt_token_ids: List[int], + detokenizer: Detokenizer, + queue: Optional[asyncio.Queue[RequestOutput]], ): - self.request_states = request_states + self.request_id = request_id + self.prompt = prompt + self.prompt_token_ids = prompt_token_ids + self.prompt_len = len(prompt_token_ids) + self.detokenizer = detokenizer + self.is_prefilling = True + self.queue = queue + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + queue: Optional[asyncio.Queue[RequestOutput]] = None, + ) -> "RequestState": + return cls( + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + detokenizer=Detokenizer.from_new_request( + tokenizer=tokenizer, + request=request, + ), + queue=queue, + ) + + +class OutputProcessor: + + def __init__(self, log_stats: bool, tokenizer: BaseTokenizerGroup): self.log_stats = log_stats + self.tokenizer = tokenizer + self.request_states: Dict[str, RequestState] = {} + + def add_request( + self, + request: EngineCoreRequest, + queue: 
Optional[asyncio.Queue[RequestOutput]] = None, + ) -> None: + request_id = request.request_id + if request_id in self.request_states: + raise ValueError(f"Request id {request_id} already running.") + + self.request_states[request_id] = RequestState.from_new_request( + tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request), + request=request, + queue=queue + ) + def make_request_output( self, @@ -50,8 +102,11 @@ def make_request_output( return request_output - def process_outputs(self, - outputs: EngineCoreOutputs) -> OutputProcessorOutput: + def process_outputs( + self, + outputs: EngineCoreOutputs, + request_states: Dict[str, RequestState], + ) -> OutputProcessorOutput: """ Process the EngineCoreOutputs: 1) Compute stats for logging @@ -84,7 +139,7 @@ def process_outputs(self, iteration_stats = IterationStats(self.log_stats) for engine_core_output in outputs.outputs: req_id = engine_core_output.request_id - req_state = self.request_states.get(req_id) + req_state = request_states.get(req_id) if req_state is None: # Ignore output for already-aborted request. continue @@ -111,7 +166,7 @@ def process_outputs(self, # Free completed requests. if request_output.finished: - self.request_states.pop(req_id) + request_states.pop(req_id) if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer # detected stop string, abort needed in EngineCore. diff --git a/vllm/v1/engine/request_state.py b/vllm/v1/engine/request_state.py index 572b6d977f350..e69de29bb2d1d 100644 --- a/vllm/v1/engine/request_state.py +++ b/vllm/v1/engine/request_state.py @@ -1,45 +0,0 @@ -import asyncio -from typing import List, Optional - -from vllm.outputs import RequestOutput -from vllm.transformers_utils.detokenizer_utils import AnyTokenizer -from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.detokenizer import Detokenizer - - -class RequestState: - """RequestState for AsyncLLM and LLMEngine.""" - - def __init__( - self, - request_id: str, - prompt: Optional[str], - prompt_token_ids: List[int], - detokenizer: Detokenizer, - queue: Optional[asyncio.Queue[RequestOutput]], - ): - self.request_id = request_id - self.prompt = prompt - self.prompt_token_ids = prompt_token_ids - self.prompt_len = len(prompt_token_ids) - self.detokenizer = detokenizer - self.is_prefilling = True - self.queue = queue - - @classmethod - def from_new_request( - cls, - tokenizer: AnyTokenizer, - request: EngineCoreRequest, - queue: Optional[asyncio.Queue[RequestOutput]] = None, - ) -> "RequestState": - return cls( - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, - detokenizer=Detokenizer.from_new_request( - tokenizer=tokenizer, - request=request, - ), - queue=queue, - ) From 79f2f5ff2aacfa1573d30ad2e094f07a24b4aa38 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:54:03 +0000 Subject: [PATCH 36/67] remove file --- vllm/v1/engine/request_state.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 vllm/v1/engine/request_state.py diff --git a/vllm/v1/engine/request_state.py b/vllm/v1/engine/request_state.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 From a16d27f1a674ec70320be24f1af39fe3eb235a17 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:55:28 +0000 Subject: [PATCH 37/67] updated --- vllm/v1/engine/async_llm.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 
732056ad4342b..99ab070333854 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -60,9 +60,6 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Request States (map of request_id -> RequestState). - self.request_states: Dict[str, RequestState] = {} - # Processor (converts Inputs --> EngineCoreRequests). self.processor = Processor( model_config=vllm_config.model_config, @@ -74,7 +71,6 @@ def __init__( # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). self.output_processor = OutputProcessor( - request_states=self.request_states, tokenizer=self.tokenizer, log_stats=self.log_stats, ) @@ -215,7 +211,8 @@ async def generate( # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - # Note: OutputProcessor handles removal from request_states. + # Note: both OutputProcessor and EngineCore handle their + # own cleanup based on finished. if out.finished: yield out break From 19372f933770bc22207734bba958e504d7205784 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:55:53 +0000 Subject: [PATCH 38/67] reduce LOC changes --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 99ab070333854..ff305ad85b276 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -212,7 +212,7 @@ async def generate( out = q.get_nowait() if q.qsize() > 0 else await q.get() # Note: both OutputProcessor and EngineCore handle their - # own cleanup based on finished. + # own request cleanup based on finished. if out.finished: yield out break From 39be5038f9d471c045ddec2f0eb3a2b7c272c1fd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:57:17 +0000 Subject: [PATCH 39/67] updated --- vllm/v1/engine/output_processor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index fe398cdf66e8d..bfb7d1230a0f5 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -77,6 +77,13 @@ def add_request( queue=queue ) + def abort_requests( + self, + request_ids: List[str], + ) -> None: + for request_id in request_ids: + self.request_states.pop(request_id, None) + def make_request_output( self, From 833f028517df84b8530ead4ea728603e9915186b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:58:22 +0000 Subject: [PATCH 40/67] updated --- vllm/v1/engine/async_llm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ff305ad85b276..515bb2e9b81d7 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -255,14 +255,12 @@ async def _run_output_handler(self): kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: - """Abort RequestId in AsyncLLM and EngineCore.""" + """Abort RequestId in OutputProcessor and EngineCore.""" request_ids = [request_id] await self.engine_core.abort_requests_async(request_ids) + self.output_processor.abort_requests(request_ids) - # If a request finishes while we await then the request_id - # will be removed from the tracked queues before we get here. 
- _ = self.request_states.pop(request_id, None) def _log_stats( self, From ef2c3f9e18988f444a88dbd85cb65b3d68ce9b56 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 21:59:41 +0000 Subject: [PATCH 41/67] updated --- vllm/v1/engine/async_llm.py | 9 +++------ vllm/v1/engine/output_processor.py | 5 ++--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 515bb2e9b81d7..c5ea2c66574ee 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -235,19 +235,17 @@ async def _run_output_handler(self): engine_core_outputs = await self.engine_core.get_output_async() # 2) Process EngineCoreOutputs. - processed_outputs = self.output_processor.process_outputs( - engine_core_outputs, self.request_states) + outputs = self.output_processor.process_outputs(engine_core_outputs) # 3) Abort any reqs that finished due to stop strings. - await self.engine_core.abort_requests_async( - processed_outputs.reqs_to_abort) + await self.engine_core.abort_requests_async(outputs.reqs_to_abort) # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once we add Prometheus. self._log_stats( scheduler_stats=engine_core_outputs.scheduler_stats, - iteration_stats=processed_outputs.iteration_stats, + iteration_stats=outputs.iteration_stats, ) except Exception as e: @@ -261,7 +259,6 @@ async def abort(self, request_id: str) -> None: await self.engine_core.abort_requests_async(request_ids) self.output_processor.abort_requests(request_ids) - def _log_stats( self, scheduler_stats: SchedulerStats, diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index bfb7d1230a0f5..16ffd1a937b55 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -112,7 +112,6 @@ def make_request_output( def process_outputs( self, outputs: EngineCoreOutputs, - request_states: Dict[str, RequestState], ) -> OutputProcessorOutput: """ Process the EngineCoreOutputs: @@ -146,7 +145,7 @@ def process_outputs( iteration_stats = IterationStats(self.log_stats) for engine_core_output in outputs.outputs: req_id = engine_core_output.request_id - req_state = request_states.get(req_id) + req_state = self.request_states.get(req_id) if req_state is None: # Ignore output for already-aborted request. continue @@ -173,7 +172,7 @@ def process_outputs( # Free completed requests. if request_output.finished: - request_states.pop(req_id) + self.request_states.pop(req_id) if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer # detected stop string, abort needed in EngineCore. From 33303fc298a028c8ad6551eed3bfe800978f59aa Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:01:57 +0000 Subject: [PATCH 42/67] updated --- vllm/v1/engine/async_llm.py | 11 ++++++----- vllm/v1/engine/output_processor.py | 6 +++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c5ea2c66574ee..88b42e76a844b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -232,20 +232,21 @@ async def _run_output_handler(self): try: while True: # 1) Pull EngineCoreOutputs from the EngineCore. - engine_core_outputs = await self.engine_core.get_output_async() + outputs = await self.engine_core.get_output_async() # 2) Process EngineCoreOutputs. 
- outputs = self.output_processor.process_outputs(engine_core_outputs) + processed_outputs = self.output_processor.process_outputs( + outputs.outputs) # 3) Abort any reqs that finished due to stop strings. - await self.engine_core.abort_requests_async(outputs.reqs_to_abort) + await self.engine_core.abort_requests_async(processed_outputs.reqs_to_abort) # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once we add Prometheus. self._log_stats( - scheduler_stats=engine_core_outputs.scheduler_stats, - iteration_stats=outputs.iteration_stats, + scheduler_stats=outputs.scheduler_stats, + iteration_stats=processed_outputs.iteration_stats, ) except Exception as e: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 16ffd1a937b55..0ea79f412981f 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -5,7 +5,7 @@ from vllm.transformers_utils.detokenizer_utils import AnyTokenizer from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.outputs import RequestOutput -from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequest +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerOutput from vllm.v1.metrics.stats import IterationStats @@ -111,7 +111,7 @@ def make_request_output( def process_outputs( self, - outputs: EngineCoreOutputs, + engine_core_outputs: List[EngineCoreOutput], ) -> OutputProcessorOutput: """ Process the EngineCoreOutputs: @@ -143,7 +143,7 @@ def process_outputs( request_outputs: List[RequestOutput] = [] reqs_to_abort: List[str] = [] iteration_stats = IterationStats(self.log_stats) - for engine_core_output in outputs.outputs: + for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id req_state = self.request_states.get(req_id) if req_state is None: From edae5d2c01b1b396ab6c7f0b8e17db179acb5fc3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:07:06 +0000 Subject: [PATCH 43/67] updated --- tests/v1/engine/test_async_llm.py | 35 +++++++++++++++--------------- vllm/v1/engine/async_llm.py | 10 ++++----- vllm/v1/engine/detokenizer.py | 6 ++--- vllm/v1/engine/llm_engine.py | 15 ++++--------- vllm/v1/engine/output_processor.py | 6 ++--- vllm/v1/metrics/stats.py | 10 ++------- 6 files changed, 32 insertions(+), 50 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index a9fa8152ed87d..ab04731e3bc1b 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -17,17 +17,15 @@ disable_log_requests=True) -async def run_example( - engine: AsyncLLM, - request_id: str, - num_tokens: int, - abort_after: int = 0 -) -> Tuple[int, int, str]: - - generator = engine.generate( - request_id=request_id, - prompt="Hello my name is Robert and", - sampling_params=SamplingParams(max_tokens=num_tokens, temperature=0)) +async def run_example(engine: AsyncLLM, + request_id: str, + num_tokens: int, + abort_after: int = 0) -> Tuple[int, int, str]: + + generator = engine.generate(request_id=request_id, + prompt="Hello my name is Robert and", + sampling_params=SamplingParams( + max_tokens=num_tokens, temperature=0)) count = 0 try: @@ -42,7 +40,7 @@ async def run_example( print(f"{request_id=}") assert request_id not in engine.request_states finally: - + expected_count = num_tokens if abort_after == 0 else abort_after return count, expected_count, request_id @@ -67,12 +65,13 @@ 
async def test_load(monkeypatch): # Create concurrent requests. tasks = [ - asyncio.create_task(run_example( - engine=engine, - request_id=request_id, - num_tokens=NUM_EXPECTED_TOKENS, - abort_after=(ABORT_AFTER if idx % ABORT_RATE == 0 else 0) - )) for idx, request_id in enumerate(request_ids) + asyncio.create_task( + run_example(engine=engine, + request_id=request_id, + num_tokens=NUM_EXPECTED_TOKENS, + abort_after=(ABORT_AFTER if idx % + ABORT_RATE == 0 else 0))) + for idx, request_id in enumerate(request_ids) ] # Confirm that we got all the EXPECTED tokens from the requests. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 88b42e76a844b..39306163ad18f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -20,7 +20,6 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor -from vllm.v1.engine.request_state import RequestState from vllm.v1.executor.abstract import Executor from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase from vllm.v1.metrics.stats import IterationStats, SchedulerStats @@ -70,10 +69,8 @@ def __init__( ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). - self.output_processor = OutputProcessor( - tokenizer=self.tokenizer, - log_stats=self.log_stats, - ) + self.output_processor = OutputProcessor(self.tokenizer, + log_stats=self.log_stats) # EngineCore (starts the engine in background process). self.engine_core = EngineCoreClient.make_client( @@ -239,7 +236,8 @@ async def _run_output_handler(self): outputs.outputs) # 3) Abort any reqs that finished due to stop strings. - await self.engine_core.abort_requests_async(processed_outputs.reqs_to_abort) + await self.engine_core.abort_requests_async( + processed_outputs.reqs_to_abort) # 4) Logging. # TODO(rob): make into a coroutine and launch it in diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8d9ca1c805ecc..2bbbe7cfd89ce 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -10,6 +10,7 @@ logger = init_logger(__name__) + @dataclass class DetokenizerOutput: output_text: str @@ -159,9 +160,8 @@ def update_from_output( output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids - return DetokenizerOutput( - output_text, token_ids, finished, finish_reason, stop_reason) - + return DetokenizerOutput(output_text, token_ids, finished, + finish_reason, stop_reason) def _get_next_output_text(self, finished: bool, delta: bool) -> str: """If delta is True, only new text since the last call to diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4c272b1b483ab..145439067253e 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -53,9 +53,6 @@ def __init__( lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Request States (map of request_id -> RequestState). - self.request_states: Dict[str, RequestState] = {} - # Processor (convert Inputs --> EngineCoreRequests) self.processor = Processor(model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, @@ -65,10 +62,8 @@ def __init__( mm_registry=mm_registry) # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). 
- self.output_processor = OutputProcessor( - request_states=self.request_states, - log_stats=False, - ) + self.output_processor = OutputProcessor(self.tokenizer, + log_stats=False) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( @@ -126,9 +121,6 @@ def add_request( priority: int = 0, ) -> None: - if request_id in self.request_states: - raise ValueError(f"Request id {request_id} already running.") - # 1) Convert Input --> Request. request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, @@ -138,7 +130,8 @@ def add_request( # 2) Make a new RequestState and queue. self.request_states[request_id] = RequestState.from_new_request( - tokenizer=self.get_tokenizer_group().get_lora_tokenizer(lora_request), + tokenizer=self.get_tokenizer_group().get_lora_tokenizer( + lora_request), request=request, ) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 0ea79f412981f..fef45138bf951 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -61,7 +61,7 @@ def __init__(self, log_stats: bool, tokenizer: BaseTokenizerGroup): self.log_stats = log_stats self.tokenizer = tokenizer self.request_states: Dict[str, RequestState] = {} - + def add_request( self, request: EngineCoreRequest, @@ -74,8 +74,7 @@ def add_request( self.request_states[request_id] = RequestState.from_new_request( tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request), request=request, - queue=queue - ) + queue=queue) def abort_requests( self, @@ -83,7 +82,6 @@ def abort_requests( ) -> None: for request_id in request_ids: self.request_states.pop(request_id, None) - def make_request_output( self, diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index b1b9445d70aec..062f419f806bf 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -24,14 +24,8 @@ def __init__(self, log_stats: bool): self.num_generation_tokens = 0 self.num_prompt_tokens = 0 - def update_from_output( - self, - output: "EngineCoreOutput", - is_prefilling: bool, - prompt_len: int - ): - """Update the IterationStats with the EngineCoreOutput.""" - + def update_from_output(self, output: "EngineCoreOutput", + is_prefilling: bool, prompt_len: int): if not self.log_stats: return From a20c7b50c1b6859efdbd18a927698e6526da858b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:08:37 +0000 Subject: [PATCH 44/67] updated --- vllm/v1/engine/llm_engine.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 145439067253e..3a0265e254dcf 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -109,6 +109,12 @@ def has_unfinished_requests(self) -> bool: def validate_outputs(cls, outputs, output_type): return outputs + def abort_request(self, request_ids: List[str]) -> None: + """Remove request_ids from EngineCore and Detokenizer.""" + + self.engine_core.abort_requests(request_ids) + self.output_processor.abort_requests(request_ids) + def add_request( self, request_id: str, @@ -121,7 +127,7 @@ def add_request( priority: int = 0, ) -> None: - # 1) Convert Input --> Request. + # 1) Process raw inputs into the request. request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, @@ -129,11 +135,7 @@ def add_request( priority) # 2) Make a new RequestState and queue. 
- self.request_states[request_id] = RequestState.from_new_request( - tokenizer=self.get_tokenizer_group().get_lora_tokenizer( - lora_request), - request=request, - ) + self.output_processor.add_request(request) # 3) Add the request to EngineCore. self.engine_core.add_request(request) From b7e5a91495aa79a76b7feec453af6d449b91559a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:09:17 +0000 Subject: [PATCH 45/67] updated --- vllm/v1/engine/llm_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 3a0265e254dcf..de2b2599281cb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -109,10 +109,10 @@ def has_unfinished_requests(self) -> bool: def validate_outputs(cls, outputs, output_type): return outputs - def abort_request(self, request_ids: List[str]) -> None: - """Remove request_ids from EngineCore and Detokenizer.""" + def abort_request(self, request_ids: List[str]) -> None: + """Remove request_ids from EngineCore and Detokenizer.""" - self.engine_core.abort_requests(request_ids) + self.engine_core.abort_requests(request_ids) self.output_processor.abort_requests(request_ids) def add_request( From 93530109539327b7f4e4705698b639c80913c5e4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:12:33 +0000 Subject: [PATCH 46/67] fixed --- vllm/v1/engine/llm_engine.py | 6 +-- vllm/v1/engine/output_processor.py | 77 +++++++++++++++++------------- 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index de2b2599281cb..efa249d6c512b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -100,10 +100,10 @@ def from_engine_args( multiprocess_mode=enable_multiprocessing) def get_num_unfinished_requests(self) -> int: - return len(self.request_states) + return self.output_processor.get_num_unfinished_requests() def has_unfinished_requests(self) -> bool: - return self.get_num_unfinished_requests() > 0 + return self.output_processor.has_unfinished_requests() @classmethod def validate_outputs(cls, outputs, output_type): @@ -148,7 +148,7 @@ def step(self) -> List[RequestOutput]: # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( - engine_core_outputs) + engine_core_outputs.output) # 3) Abort any reqs that finished due to stop strings. 
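# The abort in step 3 is needed because stop strings are only detected during
# detokenization in this front-end process, while the core scheduler only sees
# token ids and still believes those requests are running. A self-contained
# sketch of that hand-off follows; all names are illustrative, not the vLLM API.
from typing import Dict, List

def _hit_stop(text: str, stop_strings: List[str]) -> bool:
    return any(s in text for s in stop_strings)

def finished_on_stop(texts: Dict[str, str],
                     stops: Dict[str, List[str]]) -> List[str]:
    # Mirrors reqs_to_abort: ids that stopped in the front end but that the
    # core scheduler must still be told to drop.
    return [rid for rid, text in texts.items() if _hit_stop(text, stops[rid])]

class _FakeCore:
    def __init__(self) -> None:
        self.running = {"req-0", "req-1"}

    def abort_requests(self, request_ids: List[str]) -> None:
        for rid in request_ids:
            self.running.discard(rid)

core = _FakeCore()
core.abort_requests(
    finished_on_stop({"req-0": "a stop word appeared", "req-1": "still going"},
                     {"req-0": ["stop word"], "req-1": ["never"]}))
assert core.running == {"req-1"}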
self.engine_core.abort_requests(processed_outputs.reqs_to_abort) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index fef45138bf951..7d1942554db77 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -56,12 +56,30 @@ def from_new_request( class OutputProcessor: + """Process EngineCoreOutputs into RequestOutputs.""" - def __init__(self, log_stats: bool, tokenizer: BaseTokenizerGroup): + def __init__( + self, + tokenizer: BaseTokenizerGroup, + log_stats: bool, + ): self.log_stats = log_stats self.tokenizer = tokenizer self.request_states: Dict[str, RequestState] = {} + def get_num_unfinished_requests(self): + return len(self.request_states) + + def has_unfinished_requests(self) -> bool: + return len(self.request_states) > 0 + + def abort_requests( + self, + request_ids: List[str], + ) -> None: + for request_id in request_ids: + self.request_states.pop(request_id, None) + def add_request( self, request: EngineCoreRequest, @@ -76,37 +94,6 @@ def add_request( request=request, queue=queue) - def abort_requests( - self, - request_ids: List[str], - ) -> None: - for request_id in request_ids: - self.request_states.pop(request_id, None) - - def make_request_output( - self, - request_state: RequestState, - detokenizer_output: Optional[DetokenizerOutput], - ) -> Optional[RequestOutput]: - - if detokenizer_output is None: - return None - - request_output = RequestOutput.new( - request_state.request_id, - request_state.prompt, - request_state.prompt_token_ids, - detokenizer_output.output_text, - detokenizer_output.token_ids, - detokenizer_output.finished, - ) - if detokenizer_output.finished: - completion_output = request_output.outputs[0] - completion_output.finish_reason = detokenizer_output.finish_reason - completion_output.stop_reason = detokenizer_output.stop_reason - - return request_output - def process_outputs( self, engine_core_outputs: List[EngineCoreOutput], @@ -159,7 +146,7 @@ def process_outputs( engine_core_output) # 3) Create and handle RequestOutput objects. - if request_output := self.make_request_output( + if request_output := self._make_request_output( req_state, detokenizer_output): if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). 
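# A small self-contained sketch of the per-request queue hand-off used above:
# a background handler puts outputs on an asyncio.Queue while the per-request
# consumer drains it, preferring get_nowait() when items are already buffered
# to avoid an extra task switch. Names are illustrative only.
import asyncio

async def producer(q: "asyncio.Queue[str]") -> None:
    for i in range(3):
        await q.put(f"chunk-{i}")
        await asyncio.sleep(0)      # yield so the consumer can run
    await q.put("done")

async def consumer(q: "asyncio.Queue[str]") -> None:
    while True:
        # Skip the await (and the task switch) when an item is already queued.
        out = q.get_nowait() if q.qsize() > 0 else await q.get()
        print(out)
        if out == "done":
            break

async def main() -> None:
    q: "asyncio.Queue[str]" = asyncio.Queue()
    await asyncio.gather(producer(q), consumer(q))

# Example usage: asyncio.run(main())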
@@ -181,3 +168,27 @@ def process_outputs( reqs_to_abort=reqs_to_abort, iteration_stats=iteration_stats, ) + + def _make_request_output( + self, + request_state: RequestState, + detokenizer_output: Optional[DetokenizerOutput], + ) -> Optional[RequestOutput]: + + if detokenizer_output is None: + return None + + request_output = RequestOutput.new( + request_state.request_id, + request_state.prompt, + request_state.prompt_token_ids, + detokenizer_output.output_text, + detokenizer_output.token_ids, + detokenizer_output.finished, + ) + if detokenizer_output.finished: + completion_output = request_output.outputs[0] + completion_output.finish_reason = detokenizer_output.finish_reason + completion_output.stop_reason = detokenizer_output.stop_reason + + return request_output From 94de9f52102e4b78910f805e17b2c348c3c5e878 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:13:39 +0000 Subject: [PATCH 47/67] cleanup --- vllm/v1/engine/llm_engine.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index efa249d6c512b..85f2cef4f34d9 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -141,14 +141,13 @@ def add_request( self.engine_core.add_request(request) def step(self) -> List[RequestOutput]: - """Pull From EngineCore -> Process -> Return RequestOutput.""" - # 1) Pull EngineCoreOutput from the EngineCore. - engine_core_outputs = self.engine_core.get_output() + # 1) Get EngineCoreOutput from the EngineCore. + outputs = self.engine_core.get_output() # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( - engine_core_outputs.output) + outputs.output) # 3) Abort any reqs that finished due to stop strings. self.engine_core.abort_requests(processed_outputs.reqs_to_abort) From 2ea4283f93b43741b097a316409263780c5198ce Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:19:57 +0000 Subject: [PATCH 48/67] revert abort test --- tests/v1/engine/test_async_llm.py | 58 ++++++++++-------------------- vllm/v1/engine/async_llm.py | 4 ++- vllm/v1/engine/detokenizer.py | 4 +-- vllm/v1/engine/llm_engine.py | 5 ++- vllm/v1/engine/output_processor.py | 6 ++-- 5 files changed, 28 insertions(+), 49 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index ab04731e3bc1b..fffb5b8100ec7 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -13,36 +13,21 @@ allow_module_level=True) ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", - enforce_eager=True, disable_log_requests=True) -async def run_example(engine: AsyncLLM, - request_id: str, - num_tokens: int, - abort_after: int = 0) -> Tuple[int, int, str]: +async def generate(engine: AsyncLLM, request_id: str, + max_tokens: int) -> Tuple[int, str]: + count = 0 + async for _ in engine.generate(request_id=request_id, + prompt="Hello my name is Robert and", + sampling_params=SamplingParams( + max_tokens=max_tokens, temperature=0)): - generator = engine.generate(request_id=request_id, - prompt="Hello my name is Robert and", - sampling_params=SamplingParams( - max_tokens=num_tokens, temperature=0)) + count += 1 + await asyncio.sleep(0.) - count = 0 - try: - async for _ in generator(): - count += 1 - print(f"{request_id=}, {count=}, {abort_after=}") - if count == abort_after: - # Simulate request cancellation. 
- print(f"{request_id=}") - asyncio.current_task().cancel() - except asyncio.CancelledError: - print(f"{request_id=}") - assert request_id not in engine.request_states - finally: - - expected_count = num_tokens if abort_after == 0 else abort_after - return count, expected_count, request_id + return count, request_id @pytest.mark.asyncio @@ -55,31 +40,24 @@ async def test_load(monkeypatch): engine = AsyncLLM.from_engine_args(ENGINE_ARGS) - NUM_REQUESTS = 100 + NUM_REQUESTS = 10000 NUM_EXPECTED_TOKENS = 10 - # Abort 1/100 requests after 5 tokens. - ABORT_RATE = 100 - ABORT_AFTER = 5 request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] # Create concurrent requests. - tasks = [ - asyncio.create_task( - run_example(engine=engine, - request_id=request_id, - num_tokens=NUM_EXPECTED_TOKENS, - abort_after=(ABORT_AFTER if idx % - ABORT_RATE == 0 else 0))) - for idx, request_id in enumerate(request_ids) - ] + tasks = [] + for request_id in request_ids: + tasks.append( + asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS))) # Confirm that we got all the EXPECTED tokens from the requests. failed_request_id = None tokens = None for task in tasks: - num_generated_tokens, expected_tokens, request_id = await task - if (num_generated_tokens != expected_tokens + num_generated_tokens, request_id = await task + if (num_generated_tokens != NUM_EXPECTED_TOKENS and failed_request_id is None): failed_request_id = request_id tokens = num_generated_tokens diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 39306163ad18f..f1bdce2c5e474 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,6 +1,6 @@ import asyncio import os -from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union +from typing import AsyncGenerator, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -133,6 +133,8 @@ async def add_request( """Add new request to the AsyncLLM.""" # 1) Create a new output queue for the request. + if request_id in self.output_processor.request_states: + raise ValueError(f"Request id {request_id} already running.") queue: asyncio.Queue[RequestOutput] = asyncio.Queue() # 2) Convert Input --> Request. 
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 2bbbe7cfd89ce..4a8b61beec037 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -21,7 +21,7 @@ class DetokenizerOutput: @dataclass -class Detokenizer: +class IncrementalDetokenizer: # Generation data output_text: str @@ -58,7 +58,7 @@ def from_new_request( cls, tokenizer: AnyTokenizer, request: EngineCoreRequest, - ) -> "Detokenizer": + ) -> "IncrementalDetokenizer": tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( tokenizer=tokenizer, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 85f2cef4f34d9..f5999ccda6447 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -15,12 +15,11 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import ( - AnyTokenizer, BaseTokenizerGroup, init_tokenizer_from_configs) + BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor -from vllm.v1.engine.request_state import RequestState from vllm.v1.executor.abstract import Executor logger = init_logger(__name__) @@ -147,7 +146,7 @@ def step(self) -> List[RequestOutput]: # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( - outputs.output) + outputs.outputs) # 3) Abort any reqs that finished due to stop strings. self.engine_core.abort_requests(processed_outputs.reqs_to_abort) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 7d1942554db77..b89d4862e1ef8 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -6,7 +6,7 @@ from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.outputs import RequestOutput from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerOutput +from vllm.v1.engine.detokenizer import IncrementalDetokenizer, DetokenizerOutput from vllm.v1.metrics.stats import IterationStats @@ -25,7 +25,7 @@ def __init__( request_id: str, prompt: Optional[str], prompt_token_ids: List[int], - detokenizer: Detokenizer, + detokenizer: IncrementalDetokenizer, queue: Optional[asyncio.Queue[RequestOutput]], ): self.request_id = request_id @@ -47,7 +47,7 @@ def from_new_request( request_id=request.request_id, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, - detokenizer=Detokenizer.from_new_request( + detokenizer=IncrementalDetokenizer.from_new_request( tokenizer=tokenizer, request=request, ), From b9683d170efc542fe175eda95955f6bfee438e26 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:31:00 +0000 Subject: [PATCH 49/67] updared --- vllm/v1/engine/async_llm.py | 4 +++- vllm/v1/engine/output_processor.py | 8 ++++++-- vllm/v1/metrics/stats.py | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f1bdce2c5e474..b86fbc3925fe9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -133,7 +133,7 @@ async def add_request( """Add new request to the AsyncLLM.""" # 1) Create a new output queue for the request. 
- if request_id in self.output_processor.request_states: + if self.output_processor.is_request_active(request_id): raise ValueError(f"Request id {request_id} already running.") queue: asyncio.Queue[RequestOutput] = asyncio.Queue() @@ -236,6 +236,8 @@ async def _run_output_handler(self): # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( outputs.outputs) + # NOTE: RequestOutputs are pushed to their queues. + assert len(processed_outputs.request_outputs) == 0 # 3) Abort any reqs that finished due to stop strings. await self.engine_core.abort_requests_async( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index b89d4862e1ef8..2e43f45171f3e 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -2,11 +2,12 @@ from dataclasses import dataclass from typing import Dict, List, Optional +from vllm.outputs import RequestOutput from vllm.transformers_utils.detokenizer_utils import AnyTokenizer from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.outputs import RequestOutput from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import IncrementalDetokenizer, DetokenizerOutput +from vllm.v1.engine.detokenizer import (DetokenizerOutput, + IncrementalDetokenizer) from vllm.v1.metrics.stats import IterationStats @@ -67,6 +68,9 @@ def __init__( self.tokenizer = tokenizer self.request_states: Dict[str, RequestState] = {} + def is_request_active(self, request_id: str) -> bool: + return request_id in self.request_states + def get_num_unfinished_requests(self): return len(self.request_states) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 062f419f806bf..ac94b32fdd5f6 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,5 +1,5 @@ -from typing import TYPE_CHECKING from dataclasses import dataclass +from typing import TYPE_CHECKING if TYPE_CHECKING: from vllm.v1.engine import EngineCoreOutput From 92c3b0c46bb18aa015f983191c3e45fc579a268f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 22:48:23 +0000 Subject: [PATCH 50/67] stash --- vllm/v1/engine/async_llm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b86fbc3925fe9..659452bbe0ec4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -222,6 +222,7 @@ async def generate( # generate() task will be canceled. So, we abort the # request if we end up here. except asyncio.CancelledError: + print("CANCELED!") await self.abort(request_id) raise @@ -262,6 +263,9 @@ async def abort(self, request_id: str) -> None: await self.engine_core.abort_requests_async(request_ids) self.output_processor.abort_requests(request_ids) + if self.log_requests: + logger.info("Aborted request %s.", request_id) + def _log_stats( self, scheduler_stats: SchedulerStats, From a985a73bbd5aa88639d945a8889ce8b0fb44139e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:00:07 +0000 Subject: [PATCH 51/67] added logging and comment --- vllm/v1/engine/async_llm.py | 1 - vllm/v1/engine/output_processor.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 659452bbe0ec4..a74699f7513e6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -222,7 +222,6 @@ async def generate( # generate() task will be canceled. 
So, we abort the # request if we end up here. except asyncio.CancelledError: - print("CANCELED!") await self.abort(request_id) raise diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 2e43f45171f3e..749f4f5043c97 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -126,6 +126,8 @@ def process_outputs( * IterationStats.update_from_output() * Detokenizer.update_from_output() + TODO(rob): add Protocol makes update_from_output explicit. + ********************************************************** """ From 6c36d87e1049979beb6ca3b4f6e89c15a38ace09 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:08:47 +0000 Subject: [PATCH 52/67] starting to fix tests - stash --- tests/v1/engine/test_detokenizer.py | 218 ---------------------------- 1 file changed, 218 deletions(-) delete mode 100644 tests/v1/engine/test_detokenizer.py diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py deleted file mode 100644 index aeae697ca32b0..0000000000000 --- a/tests/v1/engine/test_detokenizer.py +++ /dev/null @@ -1,218 +0,0 @@ -from typing import List - -import pytest -from transformers import AutoTokenizer - -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import Detokenizer - -TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" -tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) - -FULL_STRINGS = [ - "My name is Robert from Neural Magic and I love working on vLLM so much!", - "Red Hat is the best open source company by far across Linux, K8s, and AI.", - "Nick is the name of my brother in addition to my colleague from Red Hat.", -] - -STOP_STRINGS = ["I love working on", "company by far", "brother in"] - -FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] -PROMPT_LEN = 5 -PROMPT_TOKENS = [ - tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS -] -GENERATION_TOKENS = [ - tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS -] -PROMPT_STRINGS = [ - tokenizer.decode(prompt_tokens, skip_special_tokens=True) - for prompt_tokens in PROMPT_TOKENS -] -PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] -GENERATION_STRINGS = [ - text[prompt_len:] - for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) -] - - -class MockEngineCore: - """Mock outputs form premade tokens lists.""" - - def __init__(self, tokens_list: List[List[int]]): - self.tokens_list = tokens_list - self.current_idx = 0 - - def get_outputs(self) -> List[EngineCoreOutput]: - token_idx = self.current_idx - self.current_idx += 1 - - outputs = [] - for req_idx, token_ids in enumerate(self.tokens_list): - if len(token_ids) > token_idx: - output = EngineCoreOutput(request_id=f"request-{req_idx}", - new_token_ids=[token_ids[token_idx]], - finished=False) - if token_idx == len(token_ids) - 1: - output.finished = True - output.finish_reason = "stopped" - outputs.append(output) - - return outputs - - -@pytest.mark.parametrize( - "request_output_kind", - [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -def test_incremental_detokenization(request_output_kind: RequestOutputKind): - detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore(GENERATION_TOKENS) - - # Make N requests. 
- requests = [ - EngineCoreRequest(request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - eos_token_id=None, - lora_request=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=request_output_kind, - stop=[], - include_stop_str_in_output=False)) - for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) - ] - - # Add requests to the detokenizer. - for request in requests: - detokenizer.add_request(request) - - gen_strings = {} - gen_tokens = {} - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) - assert len(requests_to_abort) == 0 - - # Update tracking. - for request_output in request_outputs: - request_id = request_output.request_id - new_text = request_output.outputs[0].text - new_tokens = request_output.outputs[0].token_ids - if request_id not in gen_strings: - gen_strings[request_id] = new_text - gen_tokens[request_id] = new_tokens - else: - gen_strings[request_id] += new_text - gen_tokens[request_id].extend(new_tokens) - - # Confirmed tracked values matches what we expected. - for idx, (ref_gen_str, ref_gen_toks) in enumerate( - zip(GENERATION_STRINGS, GENERATION_TOKENS)): - gen_str = gen_strings[f"request-{idx}"] - gen_toks = gen_tokens[f"request-{idx}"] - - assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" - assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" - - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() - - -@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -def test_stop_string(include_stop_str_in_output: bool): - detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore(GENERATION_TOKENS) - - # Make N requests. - requests = [ - EngineCoreRequest( - request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - eos_token_id=None, - lora_request=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=STOP_STRINGS, - include_stop_str_in_output=include_stop_str_in_output, - )) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) - ] - - # Add requests to the detokenizer. - for request in requests: - detokenizer.add_request(request) - - gen_strings = {} - aborted = [] - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) - for request_output in request_outputs: - # If aborted, we should not get a request output. - assert request_output.request_id not in aborted - aborted.extend(requests_to_abort) - - # Update tracking. - for request_output in request_outputs: - if request_output.finished: - assert request_output.outputs[0].finish_reason == "stop" - - request_id = request_output.request_id - new_text = request_output.outputs[0].text - if request_id not in gen_strings: - gen_strings[request_id] = new_text - else: - gen_strings[request_id] += new_text - - # Confirmed tracked values matches what we expected. 
- for idx, (ref_gen_str, - stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): - - # Request should be aborted. - request_id = f"request-{idx}" - assert request_id in aborted - - # Collected values that were generated. - gen_str = gen_strings[request_id] - - # Construct reference strings. - stop_str_idx = ref_gen_str.find(stop_str) - ref_str_exc_stop = ref_gen_str[:stop_str_idx] - ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str - - if include_stop_str_in_output: - assert gen_str == ref_str_inc_stop, ( - f"{gen_str=}, {ref_str_inc_stop=}") - else: - assert gen_str == ref_str_exc_stop, ( - f"{gen_str=}, {ref_str_exc_stop=}") - - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() From 595fd122c31032c3aa03d16a86c5332246132461 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:36:06 +0000 Subject: [PATCH 53/67] updated tests --- tests/v1/engine/test_output_processor.py | 222 +++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 tests/v1/engine/test_output_processor.py diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py new file mode 100644 index 0000000000000..c0c335d03b50a --- /dev/null +++ b/tests/v1/engine/test_output_processor.py @@ -0,0 +1,222 @@ +from typing import List + +import pytest +from transformers import AutoTokenizer + +from vllm.config import VllmConfig +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine.output_processor import OutputProcessor + +TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" +tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + +FULL_STRINGS = [ + "My name is Robert from Neural Magic and I love working on vLLM so much!", + "Red Hat is the best open source company by far across Linux, K8s, and AI.", + "Nick is the name of my brother in addition to my colleague from Red Hat.", +] + +STOP_STRINGS = ["I love working on", "company by far", "brother in"] + +FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] +PROMPT_LEN = 5 +PROMPT_TOKENS = [ + tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS +] +GENERATION_TOKENS = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS +] +PROMPT_STRINGS = [ + tokenizer.decode(prompt_tokens, skip_special_tokens=True) + for prompt_tokens in PROMPT_TOKENS +] +PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] +GENERATION_STRINGS = [ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) +] + + +class MockEngineCore: + """Mock outputs form premade tokens lists.""" + + def __init__(self, tokens_list: List[List[int]]): + self.tokens_list = tokens_list + self.current_idx = 0 + + def get_outputs(self) -> List[EngineCoreOutput]: + token_idx = self.current_idx + self.current_idx += 1 + + outputs = [] + for req_idx, token_ids in enumerate(self.tokens_list): + if len(token_ids) > token_idx: + output = EngineCoreOutput(request_id=f"request-{req_idx}", + new_token_ids=[token_ids[token_idx]], + finished=False) + if token_idx == len(token_ids) - 1: + output.finished = True + output.finish_reason = "stopped" + outputs.append(output) + + return outputs + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +def 
test_incremental_detokenization(request_output_kind: RequestOutputKind): + output_processor = OutputProcessor(TOKENIZER_NAME, log_stats=False) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + EngineCoreRequest(request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False)) + for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + output_processor.add_request(request) + + gen_strings = {} + gen_tokens = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + processed_outputs = output_processor.process_outputs(outputs,) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort + assert len(requests_to_abort) == 0 + + # Update tracking. + for request_output in request_outputs: + request_id = request_output.request_id + new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + if request_id not in gen_strings: + gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + else: + gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, ref_gen_toks) in enumerate( + zip(GENERATION_STRINGS, GENERATION_TOKENS)): + gen_str = gen_strings[f"request-{idx}"] + gen_toks = gen_tokens[f"request-{idx}"] + + assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" + assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" + + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + +@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) +def test_stop_string(include_stop_str_in_output: bool): + detokenizer = OutputProcessor(TOKENIZER_NAME, log_stats=False) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + EngineCoreRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + )) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + aborted = [] + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + request_outputs, requests_to_abort = detokenizer.step(outputs) + for request_output in request_outputs: + # If aborted, we should not get a request output. + assert request_output.request_id not in aborted + aborted.extend(requests_to_abort) + + # Update tracking. 
+ for request_output in request_outputs: + if request_output.finished: + assert request_output.outputs[0].finish_reason == "stop" + + request_id = request_output.request_id + new_text = request_output.outputs[0].text + if request_id not in gen_strings: + gen_strings[request_id] = new_text + else: + gen_strings[request_id] += new_text + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, + stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): + + # Request should be aborted. + request_id = f"request-{idx}" + assert request_id in aborted + + # Collected values that were generated. + gen_str = gen_strings[request_id] + + # Construct reference strings. + stop_str_idx = ref_gen_str.find(stop_str) + ref_str_exc_stop = ref_gen_str[:stop_str_idx] + ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str + + if include_stop_str_in_output: + assert gen_str == ref_str_inc_stop, ( + f"{gen_str=}, {ref_str_inc_stop=}") + else: + assert gen_str == ref_str_exc_stop, ( + f"{gen_str=}, {ref_str_exc_stop=}") + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() From 5ecfe8e03410b3717b2fcd2a610277f2559b1a22 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:41:28 +0000 Subject: [PATCH 54/67] make tests pass --- tests/v1/engine/test_output_processor.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index c0c335d03b50a..3fcd796a5ba4e 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -3,15 +3,20 @@ import pytest from transformers import AutoTokenizer -from vllm.config import VllmConfig +from vllm.engine.arg_utils import EngineArgs from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine.output_processor import OutputProcessor TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" +VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config() +TOKENIZER_GROUP = init_tokenizer_from_configs( + VLLM_CONFIG.model_config, VLLM_CONFIG.scheduler_config, + VLLM_CONFIG.parallel_config, VLLM_CONFIG.lora_config) tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + FULL_STRINGS = [ "My name is Robert from Neural Magic and I love working on vLLM so much!", "Red Hat is the best open source company by far across Linux, K8s, and AI.", @@ -68,7 +73,7 @@ def get_outputs(self) -> List[EngineCoreOutput]: "request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) def test_incremental_detokenization(request_output_kind: RequestOutputKind): - output_processor = OutputProcessor(TOKENIZER_NAME, log_stats=False) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. @@ -138,7 +143,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) def test_stop_string(include_stop_str_in_output: bool): - detokenizer = OutputProcessor(TOKENIZER_NAME, log_stats=False) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. 
@@ -166,7 +171,7 @@ def test_stop_string(include_stop_str_in_output: bool): # Add requests to the detokenizer. for request in requests: - detokenizer.add_request(request) + output_processor.add_request(request) gen_strings = {} aborted = [] @@ -177,7 +182,9 @@ def test_stop_string(include_stop_str_in_output: bool): break # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort for request_output in request_outputs: # If aborted, we should not get a request output. assert request_output.request_id not in aborted @@ -218,5 +225,5 @@ def test_stop_string(include_stop_str_in_output: bool): assert gen_str == ref_str_exc_stop, ( f"{gen_str=}, {ref_str_exc_stop=}") - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() From 5f37918273d27f84fdd5ce52ff3a382795234968 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:42:00 +0000 Subject: [PATCH 55/67] reduce LOC changes --- tests/v1/engine/test_output_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 3fcd796a5ba4e..5d82682fa1e18 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -16,7 +16,6 @@ VLLM_CONFIG.parallel_config, VLLM_CONFIG.lora_config) tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) - FULL_STRINGS = [ "My name is Robert from Neural Magic and I love working on vLLM so much!", "Red Hat is the best open source company by far across Linux, K8s, and AI.", From 1d9b2337894b6724167e80cb90db073b751bd956 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 12 Jan 2025 23:50:57 +0000 Subject: [PATCH 56/67] updated --- tests/v1/engine/test_output_processor.py | 31 ++++++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 5d82682fa1e18..8d92bc4508029 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -142,7 +142,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) def test_stop_string(include_stop_str_in_output: bool): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. 
@@ -157,13 +157,7 @@ def test_stop_string(include_stop_str_in_output: bool): mm_placeholders=None, eos_token_id=None, lora_request=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=STOP_STRINGS, - include_stop_str_in_output=include_stop_str_in_output, - )) for idx, ( + sampling_params=SamplingParams()) for idx, ( prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] @@ -226,3 +220,24 @@ def test_stop_string(include_stop_str_in_output: bool): assert output_processor.get_num_unfinished_requests() == 0 assert not output_processor.has_unfinished_requests() + + +def test_iteration_stats(): + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + EngineCoreRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams() + ) for idx, (prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] \ No newline at end of file From 288096214299fc91bddc5a07843a6aa4c701a6af Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 01:26:43 +0000 Subject: [PATCH 57/67] added IterationStats test --- tests/v1/engine/test_output_processor.py | 54 +++++++++++++++++++++--- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 8d92bc4508029..2702886125b30 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -142,7 +142,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) def test_stop_string(include_stop_str_in_output: bool): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. @@ -223,7 +223,7 @@ def test_stop_string(include_stop_str_in_output: bool): def test_iteration_stats(): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) + output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True) engine_core = MockEngineCore(GENERATION_TOKENS) # Make N requests. @@ -238,6 +238,50 @@ def test_iteration_stats(): mm_placeholders=None, eos_token_id=None, lora_request=None, - sampling_params=SamplingParams() - ) for idx, (prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) - ] \ No newline at end of file + sampling_params=SamplingParams(), + ) for idx, ( + prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add all requests except one to the OutputProcessor. + num_active = len(GENERATION_TOKENS) - 1 + for request in requests[:num_active]: + output_processor.add_request(request) + inactive_request = requests[num_active] + + # First iteration has 2 prefills. 
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = sum(
+        [len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Add a new requrest - prefill and 2 decodes in this step.
+    output_processor.add_request(inactive_request)
+    num_active += 1
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active

From 7de7c00113e0d89442c9ab18e7004b364de043aa Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" 
Date: Mon, 13 Jan 2025 01:28:22 +0000
Subject: [PATCH 58/67] codespell

---
 tests/v1/engine/test_output_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index 2702886125b30..912bac9513c86 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -267,7 +267,7 @@ def test_iteration_stats():
     assert iteration_stats.num_prompt_tokens == 0
     assert iteration_stats.num_generation_tokens == num_active
 
-    # Add a new requrest - prefill and 2 decodes in this step.
+    # Add a new request - prefill and 2 decodes in this step.
     output_processor.add_request(inactive_request)
     num_active += 1
    outputs = engine_core.get_outputs()[:num_active]

From eec573cd48e9d06973b00a9cfcbe36727269b132 Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" 
Date: Mon, 13 Jan 2025 01:33:53 +0000
Subject: [PATCH 59/67] add comment about invariant

---
 vllm/v1/metrics/stats.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index ac94b32fdd5f6..381ee1dd29251 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -31,4 +31,9 @@ def update_from_output(self, output: "EngineCoreOutput",
         self.num_generation_tokens += len(output.new_token_ids)
         if is_prefilling:
+            # This relies on the invariant that EngineCore does
+            # not stream outputs for partially completed prefills
+            # (scheduler.update_from_output makes EngineCoreOutput
+            # iff num_computed_tokens == num_tokens).
+ assert(output.new_token_ids > 1) self.num_prompt_tokens += prompt_len From 0427e03a5b782da4fb6bc5bfebfc3420b7630e90 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 01:44:18 +0000 Subject: [PATCH 60/67] updated --- tests/v1/engine/test_async_llm.py | 53 +++++++++++++++++++++++++++++++ vllm/v1/metrics/stats.py | 2 +- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index fffb5b8100ec7..7d244c2f8e40e 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -66,4 +66,57 @@ async def test_load(monkeypatch): f"{failed_request_id} generated {tokens} but " f"expected {NUM_EXPECTED_TOKENS}") + # Make sure RequestStates get cleaned up. + assert not engine.output_processor.has_unfinished_requests() engine.shutdown() + + +@pytest.mark.asyncio +async def test_abort(monkeypatch): + # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 + # so that in the future when we switch, we don't have to change all the + # tests. + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine = AsyncLLM.from_engine_args(ENGINE_ARGS) + + NUM_REQUESTS = 100 + NUM_EXPECTED_TOKENS = 100 + REQUEST_IDS_TO_ABORT = [1 + idx * 10 for idx in range(10)] + + request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] + + # Create concurrent requests. + tasks = [] + for request_id in request_ids: + tasks.append( + asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS))) + + # API server cancels requests when they are aborted. + for idx in REQUEST_IDS_TO_ABORT: + tasks[idx].cancel() + + # Confirm the other requests are okay. + failed_request_id = None + tokens = None + for idx, task in enumerate(tasks): + + # Confirm that it was actually canceled. + if idx in REQUEST_IDS_TO_ABORT: + with pytest.raises(asyncio.CancelledError): + await task + + # Otherwise, make sure the request was not impacted. + num_generated_tokens, request_id = await task + if (num_generated_tokens != NUM_EXPECTED_TOKENS + and failed_request_id is None): + failed_request_id = request_id + tokens = num_generated_tokens + + assert failed_request_id is None, ( + f"{failed_request_id} generated {tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") + + assert not engine.output_processor.has_unfinished_requests() \ No newline at end of file diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 381ee1dd29251..bd5fbc922f679 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -35,5 +35,5 @@ def update_from_output(self, output: "EngineCoreOutput", # not stream outputs for partially completed prefills # (scheduler.update_from_output makes EngineCoreOutput # iff num_computed_tokens == num_tokens). 
- assert(output.new_token_ids > 1) + assert(len(output.new_token_ids) > 1) self.num_prompt_tokens += prompt_len From 9b49133885b6760ce056a65e937fe2bad2dcf24d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 01:46:56 +0000 Subject: [PATCH 61/67] tweak --- tests/v1/engine/test_async_llm.py | 12 ++++++++++-- vllm/v1/metrics/stats.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 7d244c2f8e40e..ef2f947d4a963 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -66,7 +66,6 @@ async def test_load(monkeypatch): f"{failed_request_id} generated {tokens} but " f"expected {NUM_EXPECTED_TOKENS}") - # Make sure RequestStates get cleaned up. assert not engine.output_processor.has_unfinished_requests() engine.shutdown() @@ -119,4 +118,13 @@ async def test_abort(monkeypatch): f"{failed_request_id} generated {tokens} but " f"expected {NUM_EXPECTED_TOKENS}") - assert not engine.output_processor.has_unfinished_requests() \ No newline at end of file + assert not engine.output_processor.has_unfinished_requests() + + # Confirm we can do another generation. + task = asyncio.create_task(generate( + engine, REQUEST_IDS_TO_ABORT[0], NUM_EXPECTED_TOKENS)) + num_generated_tokens, request_id = await task + assert num_generated_tokens == NUM_EXPECTED_TOKENS + assert not engine.output_processor.has_unfinished_requests() + + engine.shutdown() diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index bd5fbc922f679..ddd220c4b9be6 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -35,5 +35,5 @@ def update_from_output(self, output: "EngineCoreOutput", # not stream outputs for partially completed prefills # (scheduler.update_from_output makes EngineCoreOutput # iff num_computed_tokens == num_tokens). - assert(len(output.new_token_ids) > 1) + assert(len(output.new_token_ids) > 0) self.num_prompt_tokens += prompt_len From bffa5d011fd5474fab0afb8772bcd3f66e838737 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 02:06:26 +0000 Subject: [PATCH 62/67] formatting and added test --- tests/v1/engine/test_async_llm.py | 52 +++++++++++-------------------- vllm/v1/metrics/stats.py | 2 +- 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index ef2f947d4a963..6764ee799abb0 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,5 +1,5 @@ import asyncio -from typing import Tuple +from typing import List, Tuple import pytest @@ -13,6 +13,7 @@ allow_module_level=True) ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", + enforce_eager=True, disable_log_requests=True) @@ -53,18 +54,11 @@ async def test_load(monkeypatch): generate(engine, request_id, NUM_EXPECTED_TOKENS))) # Confirm that we got all the EXPECTED tokens from the requests. 
- failed_request_id = None - tokens = None for task in tasks: num_generated_tokens, request_id = await task - if (num_generated_tokens != NUM_EXPECTED_TOKENS - and failed_request_id is None): - failed_request_id = request_id - tokens = num_generated_tokens - - assert failed_request_id is None, ( - f"{failed_request_id} generated {tokens} but " - f"expected {NUM_EXPECTED_TOKENS}") + assert num_generated_tokens == NUM_EXPECTED_TOKENS, ( + f"{request_id} generated {num_generated_tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") assert not engine.output_processor.has_unfinished_requests() engine.shutdown() @@ -72,9 +66,7 @@ async def test_load(monkeypatch): @pytest.mark.asyncio async def test_abort(monkeypatch): - # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 - # so that in the future when we switch, we don't have to change all the - # tests. + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -82,47 +74,41 @@ async def test_abort(monkeypatch): NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 100 - REQUEST_IDS_TO_ABORT = [1 + idx * 10 for idx in range(10)] + REQUEST_IDS_TO_ABORT = range(1, 100, 10) request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] # Create concurrent requests. - tasks = [] + tasks: List[asyncio.Task] = [] for request_id in request_ids: tasks.append( asyncio.create_task( generate(engine, request_id, NUM_EXPECTED_TOKENS))) - + # API server cancels requests when they are aborted. for idx in REQUEST_IDS_TO_ABORT: tasks[idx].cancel() + await asyncio.sleep(0.05) # Confirm the other requests are okay. - failed_request_id = None - tokens = None for idx, task in enumerate(tasks): - # Confirm that it was actually canceled. if idx in REQUEST_IDS_TO_ABORT: with pytest.raises(asyncio.CancelledError): await task - - # Otherwise, make sure the request was not impacted. - num_generated_tokens, request_id = await task - if (num_generated_tokens != NUM_EXPECTED_TOKENS - and failed_request_id is None): - failed_request_id = request_id - tokens = num_generated_tokens - - assert failed_request_id is None, ( - f"{failed_request_id} generated {tokens} but " - f"expected {NUM_EXPECTED_TOKENS}") + else: + # Otherwise, make sure the request was not impacted. + num_generated_tokens, request_id = await task + assert num_generated_tokens == NUM_EXPECTED_TOKENS, ( + f"{request_id} generated {num_generated_tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") assert not engine.output_processor.has_unfinished_requests() # Confirm we can do another generation. - task = asyncio.create_task(generate( - engine, REQUEST_IDS_TO_ABORT[0], NUM_EXPECTED_TOKENS)) + request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}" + task = asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS)) num_generated_tokens, request_id = await task assert num_generated_tokens == NUM_EXPECTED_TOKENS assert not engine.output_processor.has_unfinished_requests() diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index ddd220c4b9be6..60cb986f8bbce 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -35,5 +35,5 @@ def update_from_output(self, output: "EngineCoreOutput", # not stream outputs for partially completed prefills # (scheduler.update_from_output makes EngineCoreOutput # iff num_computed_tokens == num_tokens). 
- assert(len(output.new_token_ids) > 0) + assert (len(output.new_token_ids) > 0) self.num_prompt_tokens += prompt_len From 605c5f0d8f8a0f65f2fd5d81a66badfc7975cbdb Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 02:07:35 +0000 Subject: [PATCH 63/67] passing --- tests/v1/engine/test_async_llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 6764ee799abb0..2c805e18eebae 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -85,10 +85,10 @@ async def test_abort(monkeypatch): asyncio.create_task( generate(engine, request_id, NUM_EXPECTED_TOKENS))) - # API server cancels requests when they are aborted. + # API server cancels requests when they disconnect. for idx in REQUEST_IDS_TO_ABORT: tasks[idx].cancel() - await asyncio.sleep(0.05) + await asyncio.sleep(0.1) # Confirm the other requests are okay. for idx, task in enumerate(tasks): From d0013a48ae5a1d9d1c43f319323793ee11528920 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 02:09:30 +0000 Subject: [PATCH 64/67] ruff ruff --- tests/v1/engine/test_output_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 912bac9513c86..920b12459d93d 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -239,8 +239,8 @@ def test_iteration_stats(): eos_token_id=None, lora_request=None, sampling_params=SamplingParams(), - ) for idx, ( - prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ) for idx, (prompt, prompt_tokens) in enumerate( + zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] # Add all requests except one to the OutputProcessor. From e01d236f7acd468a7a3c79abb12c057cdc789747 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 02:27:09 +0000 Subject: [PATCH 65/67] format --- tests/v1/engine/test_output_processor.py | 40 +++++++++++++----------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 920b12459d93d..c04e170a9f433 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -11,9 +11,10 @@ TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config() -TOKENIZER_GROUP = init_tokenizer_from_configs( - VLLM_CONFIG.model_config, VLLM_CONFIG.scheduler_config, - VLLM_CONFIG.parallel_config, VLLM_CONFIG.lora_config) +TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config, + VLLM_CONFIG.scheduler_config, + VLLM_CONFIG.parallel_config, + VLLM_CONFIG.lora_config) tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) FULL_STRINGS = [ @@ -110,7 +111,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): break # Step the Detokenizer. - processed_outputs = output_processor.process_outputs(outputs,) + processed_outputs = output_processor.process_outputs(outputs, ) request_outputs = processed_outputs.request_outputs requests_to_abort = processed_outputs.reqs_to_abort assert len(requests_to_abort) == 0 @@ -147,19 +148,19 @@ def test_stop_string(include_stop_str_in_output: bool): # Make N requests. 
requests = [ - EngineCoreRequest( - request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - eos_token_id=None, - lora_request=None, - sampling_params=SamplingParams()) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + EngineCoreRequest(request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams()) + for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] # Add requests to the detokenizer. @@ -239,8 +240,9 @@ def test_iteration_stats(): eos_token_id=None, lora_request=None, sampling_params=SamplingParams(), - ) for idx, (prompt, prompt_tokens) in enumerate( - zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] # Add all requests except one to the OutputProcessor. From a53f089c11925adaa215d17fb83e63b1b4418e83 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 02:36:12 +0000 Subject: [PATCH 66/67] run isort --- tests/v1/engine/test_output_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index c04e170a9f433..57e3e2a649839 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -4,8 +4,8 @@ from transformers import AutoTokenizer from vllm.engine.arg_utils import EngineArgs -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine.output_processor import OutputProcessor From 3e45fc66144f9a5088cb605e2d1b96558e815a98 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 03:00:04 +0000 Subject: [PATCH 67/67] undo fat finger Signed-off-by: rshaw@neuralmagic.com --- tests/v1/engine/test_output_processor.py | 32 ++++++++++++++---------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 57e3e2a649839..4735c6f947537 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -148,19 +148,25 @@ def test_stop_string(include_stop_str_in_output: bool): # Make N requests. 
requests = [ - EngineCoreRequest(request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - eos_token_id=None, - lora_request=None, - sampling_params=SamplingParams()) - for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + EngineCoreRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + )) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] # Add requests to the detokenizer.