Commit 976d032

fix PP : avoid using sampling metadata cache
Varun Sundar Rabindranath committed Aug 25, 2024
1 parent 6eac258 commit 976d032
Showing 2 changed files with 5 additions and 2 deletions.
4 changes: 3 additions & 1 deletion vllm/worker/model_runner.py
@@ -1395,7 +1395,9 @@ def prepare_model_input(
             sampling_metadata = SamplingMetadata.prepare(
                 seq_group_metadata_list, model_input.seq_lens,
                 model_input.query_lens, self.device, self.pin_memory,
-                generators, self.sampling_metadata_cache)
+                generators,
+                # TODO(varun) : Fix sampling metadata cache impl.
+                None)
         else:
             sampling_metadata = None
         is_prompt = (seq_group_metadata_list[0].is_prompt
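
The change passes None instead of self.sampling_metadata_cache as the final cache argument of SamplingMetadata.prepare, so each step builds fresh metadata. The commit title points at pipeline parallelism (PP): with several microbatches in flight at once, a reuse-based cache can hand two steps the same mutable object. Below is a minimal sketch of that suspected failure mode; ToyMetadata, ToyCache, and the toy prepare are hypothetical stand-ins, not vLLM's actual implementation.

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ToyMetadata:
    # Stands in for a mutable SamplingMetadata-like object.
    seq_ids: list = field(default_factory=list)


class ToyCache:
    # Reuses one metadata object per batch size to avoid reallocation.
    def __init__(self):
        self._pool = {}

    def get(self, batch_size: int) -> ToyMetadata:
        # Repeated calls with the same batch size return the SAME object.
        return self._pool.setdefault(batch_size, ToyMetadata())


def prepare(seq_ids: list, cache: Optional[ToyCache]) -> ToyMetadata:
    # Mirrors the cache parameter of SamplingMetadata.prepare: pass a cache
    # to reuse objects, or None to allocate fresh metadata on every call.
    meta = cache.get(len(seq_ids)) if cache is not None else ToyMetadata()
    meta.seq_ids = seq_ids  # in-place mutation, as a reuse cache implies
    return meta


cache = ToyCache()
# Under PP, two equally sized microbatches can be in flight at once.
step_a = prepare([0, 1], cache)
step_b = prepare([2, 3], cache)  # reuses and clobbers step_a's object
assert step_a is step_b and step_a.seq_ids == [2, 3]  # step_a is now stale

# Passing None, as this commit does, keeps each step's metadata private.
step_a = prepare([0, 1], None)
step_b = prepare([2, 3], None)
assert step_a is not step_b and step_a.seq_ids == [0, 1]
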
3 changes: 2 additions & 1 deletion vllm/worker/multi_step_model_runner.py
@@ -305,7 +305,8 @@ def prepare_model_input(
                 frozen_model_input.seq_lens[num_prompts:],
                 frozen_model_input.query_lens[num_prompts:],
                 self.device, self.pin_memory, generators,
-                self.sampling_metadata_cache)
+                # TODO (varun) : Fix sampling metadata cache impl
+                None)
             sampling_metadata_decodes.skip_sampler_cpu_output = (True)
 
         model_input = StatefulModelInput(
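
In the multi-step runner, sampling metadata is re-prepared only for the decode tail of the batch, which is why every argument in the hunk above is sliced from num_prompts onward. A toy illustration of that slicing follows; the values are invented for the example and assume prompts are ordered before decodes, as the slicing itself implies.

# Toy values; in vLLM these come from the scheduler, not hardcoded lists.
seq_group_metadata_list = ["prompt0", "decode0", "decode1"]
seq_lens = [17, 5, 9]    # total tokens per sequence so far
query_lens = [17, 1, 1]  # tokens processed this step (decodes process 1)
num_prompts = 1

# Same [num_prompts:] slicing as the hunk above: keep only decode entries.
decode_groups = seq_group_metadata_list[num_prompts:]
decode_seq_lens = seq_lens[num_prompts:]
decode_query_lens = query_lens[num_prompts:]

assert decode_groups == ["decode0", "decode1"]
assert decode_seq_lens == [5, 9] and decode_query_lens == [1, 1]
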
