From 7902fa28603a5d3a7ff9191c6b07da5eb3cf38b1 Mon Sep 17 00:00:00 2001
From: Michal Adamczyk
Date: Thu, 14 Nov 2024 10:31:11 +0200
Subject: [PATCH] Fix number of blocks when profiling contiguous pa

---
 vllm/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index f67d604234fbe..22aa5bf5a7fc9 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1121,8 +1121,9 @@ def _prepare_decode(
 
         padding_fn = None
         if self.use_contiguous_pa:
+            block_bucket_size = max(max(block_list) + 1, len(block_list))
             block_bucket_size = find_bucket(
-                max(block_list) + 1,
+                block_bucket_size,
                 self.bucketing_global_state.decode_block_bucket_cfg)
             indices: List[Any]
             indices = [None] * block_bucket_size