Dynamic scheduler delay to improve ITL performance (#3279)
Co-authored-by: Jan van Lunteren <[email protected]>
tdoublep and jvlunteren authored Mar 22, 2024
1 parent f721096 commit cf2f084
Showing 4 changed files with 72 additions and 2 deletions.
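
This commit gates prompt scheduling behind a configurable delay: a new prompt batch is only scheduled once the oldest waiting request has waited longer than `delay_factor` multiplied by the latency of the previous prompt step (or when nothing is running). Letting the waiting queue fill up means prefills get batched together instead of repeatedly interrupting decode steps, which improves inter-token latency (ITL) for already-running sequences. A minimal usage sketch, assuming the surrounding `EngineArgs` fields behave as in vLLM at the time of this commit; only `scheduler_delay_factor` itself is introduced here, and the model name is a placeholder:

```python
from vllm.engine.arg_utils import EngineArgs

# Illustrative only: wait for 0.5x the previous prompt step's latency before
# scheduling the next prompt batch. The default of 0.0 disables the feature.
engine_args = EngineArgs(model="facebook/opt-125m",
                         scheduler_delay_factor=0.5)
# create_engine_configs() forwards this value into SchedulerConfig
# (see the vllm/engine/arg_utils.py hunk below).
print(engine_args.scheduler_delay_factor)  # 0.5
```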
34 changes: 34 additions & 0 deletions tests/core/test_scheduler.py
@@ -1,5 +1,6 @@
from typing import List
import pytest # noqa
import time

from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler
@@ -168,3 +169,36 @@ def test_scheduler_max_seqs():
# and one is prompting.
_, out = scheduler.schedule()
assert set(out.scheduled_seq_groups) == set([all_seq_groups[1]])


def test_scheduler_delay_factor():

block_size = 4
scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)

# schedule first prompt
_, seq_group = create_dummy_prompt("0", prompt_length=block_size)
scheduler.add_seq_group(seq_group)
seq_group_meta, out = scheduler.schedule()
assert out.prompt_run
assert seq_group_meta[0].request_id == '0'

# wait for a second before scheduling next prompt
time.sleep(1)
_, seq_group = create_dummy_prompt("1", prompt_length=block_size)
scheduler.add_seq_group(seq_group)

# second prompt should *not* be scheduled
seq_group_meta, out = scheduler.schedule()
assert not out.prompt_run
assert seq_group_meta[0].request_id == '0'

# wait for more than 0.5 second and try again
time.sleep(0.6)
seq_group_meta, out = scheduler.schedule()
assert out.prompt_run
assert seq_group_meta[0].request_id == '1'
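
Working through the timings in this test (all figures approximate): after `time.sleep(1)`, the second `schedule()` call records `last_prompt_latency ≈ 1.0 s`, while prompt "1" has only just arrived, so `0 s > 0.5 × 1.0 s` fails and only the running sequence is decoded; after the extra `time.sleep(0.6)`, `0.6 s > 0.5 s` holds and prompt "1" is finally prefilled. A sketch of that arithmetic:

```python
# Approximate numbers for the test above; illustrative only.
delay_factor = 0.5
last_prompt_latency = 1.0   # time between the first and second schedule() calls

waited = 0.0                # prompt "1" was added immediately before schedule()
assert not (waited > delay_factor * last_prompt_latency)  # still delayed: decode only

waited = 0.6                # after time.sleep(0.6)
assert waited > delay_factor * last_prompt_latency        # delay passed: prompt "1" runs
```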
4 changes: 4 additions & 0 deletions vllm/config.py
@@ -517,13 +517,16 @@ class SchedulerConfig:
iteration.
max_model_len: Maximum length of a sequence (including prompt
and generated text).
delay_factor: Apply a delay (of delay factor multiplied by previous
prompt latency) before scheduling next prompt.
"""

def __init__(
self,
max_num_batched_tokens: Optional[int],
max_num_seqs: int,
max_model_len: int,
delay_factor: float = 0.0,
) -> None:
if max_num_batched_tokens is not None:
self.max_num_batched_tokens = max_num_batched_tokens
@@ -533,6 +536,7 @@ def __init__(
self.max_num_batched_tokens = max(max_model_len, 2048)
self.max_num_seqs = max_num_seqs
self.max_model_len = max_model_len
self.delay_factor = delay_factor
self._verify_args()

def _verify_args(self) -> None:
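
The new keyword defaults to 0.0, so existing callers are unaffected. Constructing the config directly, mirroring the unit test above (the positional arguments are max_num_batched_tokens, max_num_seqs, and max_model_len):

```python
from vllm.config import SchedulerConfig

# delay_factor=0.5: wait for half of the previous prompt step's latency
# before scheduling the next prompt.
scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5)
assert scheduler_config.delay_factor == 0.5
```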
26 changes: 25 additions & 1 deletion vllm/core/scheduler.py
@@ -103,6 +103,13 @@ def __init__(
# Sequence groups in the SWAPPED state.
self.swapped: Deque[SequenceGroup] = deque()

# Time at previous scheduling step
self.prev_time = 0.0
# Did we schedule a prompt at previous step?
self.prev_prompt = False
# Latency of the last prompt step
self.last_prompt_latency = 0.0

@property
def lora_enabled(self) -> bool:
return bool(self.lora_config)
@@ -179,7 +186,7 @@ def _schedule(self) -> SchedulerOutputs:
# are added to the back.
leftover_waiting_sequences = deque()
num_batched_tokens = 0
while self.waiting:
while self._passed_delay(now) and self.waiting:
seq_group = self.waiting[0]
waiting_seqs = seq_group.get_seqs(
status=SequenceStatus.WAITING)
@@ -246,6 +253,7 @@ def _schedule(self) -> SchedulerOutputs:
self.waiting.extendleft(leftover_waiting_sequences)

if scheduled or ignored_seq_groups:
self.prev_prompt = True
scheduler_outputs = SchedulerOutputs(
scheduled_seq_groups=scheduled,
prompt_run=True,
@@ -491,3 +499,19 @@ def _swap_out(

def mark_blocks_as_computed(self, seq_group: SequenceGroup):
self.block_manager.mark_blocks_as_computed(seq_group)

def _passed_delay(self, now: float) -> bool:
if self.prev_prompt:
self.last_prompt_latency = now - self.prev_time
self.prev_time, self.prev_prompt = now, False
# Delay scheduling prompts to let waiting queue fill up
if self.scheduler_config.delay_factor > 0 and self.waiting:
earliest_arrival_time = min(
[e.metrics.arrival_time for e in self.waiting])
passed_delay = (
(now - earliest_arrival_time) >
(self.scheduler_config.delay_factor * self.last_prompt_latency)
or not self.running)
else:
passed_delay = True
return passed_delay
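
`_passed_delay` is what actually throttles prefills: while the delay has not elapsed, the `while self._passed_delay(now) and self.waiting` loop never pops the waiting queue, so `_schedule` falls through to decoding the already-running sequences. A standalone restatement of the predicate with hypothetical names, purely for illustration (the real method also keeps the `prev_time`/`prev_prompt` bookkeeping shown above):

```python
def passed_delay(now: float, earliest_arrival_time: float,
                 last_prompt_latency: float, delay_factor: float,
                 has_running: bool) -> bool:
    """Illustrative restatement of the condition in Scheduler._passed_delay."""
    if delay_factor <= 0:
        return True  # feature disabled: schedule prompts immediately
    # Schedule once the oldest waiting request has waited longer than
    # delay_factor times the previous prompt step's latency, or when there
    # are no running sequences whose decode steps need protecting.
    return ((now - earliest_arrival_time) >
            delay_factor * last_prompt_latency) or not has_running
```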
10 changes: 9 additions & 1 deletion vllm/engine/arg_utils.py
@@ -51,6 +51,7 @@ class EngineArgs:
max_cpu_loras: Optional[int] = None
device: str = 'auto'
ray_workers_use_nsight: bool = False
scheduler_delay_factor: float = 0.0

def __post_init__(self):
if self.tokenizer is None:
@@ -305,6 +306,12 @@ def add_cli_args(
default=EngineArgs.device,
choices=["auto", "cuda", "neuron"],
help='Device type for vLLM execution.')
parser.add_argument(
'--scheduler-delay-factor',
type=float,
default=EngineArgs.scheduler_delay_factor,
help='Apply a delay (of delay factor multiplied by previous '
'prompt latency) before scheduling next prompt.')
return parser

@classmethod
@@ -342,7 +349,8 @@ def create_engine_configs(
), self.ray_workers_use_nsight)
scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
self.max_num_seqs,
model_config.max_model_len)
model_config.max_model_len,
self.scheduler_delay_factor)
lora_config = LoRAConfig(
max_lora_rank=self.max_lora_rank,
max_loras=self.max_loras,
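
End to end, the flag travels from argparse through `EngineArgs` into `SchedulerConfig`. A sketch of that path, assuming `add_cli_args` and `from_cli_args` behave as in vLLM at the time of this commit; only `--scheduler-delay-factor` itself comes from this diff, and the model name is a placeholder:

```python
import argparse

from vllm.engine.arg_utils import EngineArgs

# Build a parser with the engine's CLI arguments, including the new flag.
parser = EngineArgs.add_cli_args(argparse.ArgumentParser())
args = parser.parse_args(["--model", "facebook/opt-125m",
                          "--scheduler-delay-factor", "0.5"])
engine_args = EngineArgs.from_cli_args(args)
print(engine_args.scheduler_delay_factor)  # 0.5
```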
