From f97e0ae66bb8ac0843d129f74c426c570ec896dc Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 20 Aug 2024 21:42:52 -0400 Subject: [PATCH 01/47] added example --- examples/offline_inference_multi_step.py | 36 ++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 examples/offline_inference_multi_step.py diff --git a/examples/offline_inference_multi_step.py b/examples/offline_inference_multi_step.py new file mode 100644 index 0000000000000..79e0fca6bccf3 --- /dev/null +++ b/examples/offline_inference_multi_step.py @@ -0,0 +1,36 @@ +''' +Example of setting up LLM with multi-step enabled. +In actuality, async engine would be a more sensible choice +from a performance perspective. However this example is useful +for demonstration & debugging of multi-step code. +''' + +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM( + model="JackFram/llama-160m", + swap_space=16, + tensor_parallel_size=1, + gpu_memory_utilization=0.9, + num_scheduler_steps=8, + use_v2_block_manager=True, +) +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") From f969241461d39a8c5cc594ee8f3cb42866d1f0e5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 20 Aug 2024 22:09:50 -0400 Subject: [PATCH 02/47] wip: --- vllm/worker/multi_step_model_runner.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 521205eca05af..de3de95e16881 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -10,6 +10,7 @@ import torch +from vllm.model_executor.layers.sampler import _get_logprobs from vllm import _custom_ops as ops from vllm.distributed import get_pp_group from vllm.logger import init_logger @@ -295,16 +296,18 @@ def execute_model( model_input.cached_outputs.append( ModelOutput(output[0], output_ready_event, output[0].sampled_token_ids, False)) - # make sure we dont try to serialize any GPU tensors - output[0].sampled_token_ids = None - output[0].sampled_token_probs = None - output[0].logprobs = None + # Pythonize the output if CPU is ahead and the previous step is # ready. 
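(Editor's aside, not part of the patch: the example's docstring notes that the async engine is the more sensible choice for performance. A minimal sketch of that variant, assuming the AsyncLLMEngine / AsyncEngineArgs API of this vLLM version and the same multi-step settings as the offline example above:)

import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


async def main():
    # Same multi-step settings as the offline example.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="JackFram/llama-160m",
                        num_scheduler_steps=8,
                        use_v2_block_manager=True))
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    final_output = None
    # generate() is an async generator yielding incremental RequestOutputs;
    # keep only the final one.
    async for request_output in engine.generate("The capital of France is",
                                                sampling_params,
                                                request_id="0"):
        final_output = request_output
    print(final_output.outputs[0].text)


asyncio.run(main())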
for model_output in model_input.cached_outputs: model_output.maybe_pythonize(model_input, self._copy_stream, self.pinned_sampled_token_ids) + # make sure we dont try to serialize any GPU tensors + output[0].sampled_token_ids = None + output[0].sampled_token_probs = None + output[0].logprobs = None + model_input.current_step += 1 if not get_pp_group().is_last_rank: From 642d31b814b3dceab9636dcb3c1367ea5968699d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 09:39:16 -0400 Subject: [PATCH 03/47] first working attempt at logprobs --- examples/offline_inference_multi_step.py | 1 + tests/spec_decode/test_multi_step_worker.py | 3 +- tests/spec_decode/test_spec_decode_worker.py | 3 +- tests/spec_decode/utils.py | 4 +- tests/test_sequence.py | 5 +- vllm/engine/async_llm_engine.py | 4 +- vllm/engine/llm_engine.py | 6 +- vllm/engine/output_processor/util.py | 3 +- vllm/engine/protocol.py | 2 +- vllm/executor/cpu_executor.py | 3 +- vllm/executor/distributed_gpu_executor.py | 3 +- vllm/executor/executor_base.py | 3 +- vllm/executor/gpu_executor.py | 3 +- vllm/executor/multiproc_gpu_executor.py | 3 +- vllm/executor/neuron_executor.py | 3 +- vllm/executor/openvino_executor.py | 3 +- vllm/executor/ray_gpu_executor.py | 3 +- vllm/executor/ray_tpu_executor.py | 3 +- vllm/executor/ray_xpu_executor.py | 3 +- vllm/executor/tpu_executor.py | 3 +- vllm/executor/xpu_executor.py | 3 +- vllm/model_executor/layers/sampler.py | 255 +++++++++++++++--- vllm/model_executor/model_loader/neuron.py | 3 +- vllm/model_executor/model_loader/openvino.py | 3 +- vllm/model_executor/models/arctic.py | 4 +- vllm/model_executor/models/baichuan.py | 4 +- vllm/model_executor/models/bart.py | 4 +- vllm/model_executor/models/blip2.py | 4 +- vllm/model_executor/models/bloom.py | 4 +- vllm/model_executor/models/chameleon.py | 4 +- vllm/model_executor/models/chatglm.py | 4 +- vllm/model_executor/models/commandr.py | 4 +- vllm/model_executor/models/dbrx.py | 4 +- vllm/model_executor/models/deepseek.py | 4 +- vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/falcon.py | 4 +- vllm/model_executor/models/fuyu.py | 3 +- vllm/model_executor/models/gemma.py | 4 +- vllm/model_executor/models/gemma2.py | 4 +- vllm/model_executor/models/gpt2.py | 4 +- vllm/model_executor/models/gpt_bigcode.py | 4 +- vllm/model_executor/models/gpt_j.py | 4 +- vllm/model_executor/models/gpt_neox.py | 4 +- vllm/model_executor/models/internlm2.py | 4 +- vllm/model_executor/models/internvl.py | 3 +- vllm/model_executor/models/jais.py | 4 +- vllm/model_executor/models/jamba.py | 4 +- vllm/model_executor/models/llama.py | 4 +- vllm/model_executor/models/llava.py | 3 +- vllm/model_executor/models/llava_next.py | 3 +- vllm/model_executor/models/medusa.py | 2 +- vllm/model_executor/models/minicpm.py | 4 +- vllm/model_executor/models/minicpmv.py | 4 +- vllm/model_executor/models/mixtral.py | 4 +- vllm/model_executor/models/mixtral_quant.py | 4 +- vllm/model_executor/models/mlp_speculator.py | 3 +- vllm/model_executor/models/mpt.py | 4 +- vllm/model_executor/models/nemotron.py | 4 +- vllm/model_executor/models/olmo.py | 4 +- vllm/model_executor/models/opt.py | 4 +- vllm/model_executor/models/orion.py | 4 +- vllm/model_executor/models/paligemma.py | 4 +- vllm/model_executor/models/persimmon.py | 4 +- vllm/model_executor/models/phi.py | 4 +- vllm/model_executor/models/phi3_small.py | 4 +- vllm/model_executor/models/phi3v.py | 4 +- vllm/model_executor/models/qwen.py | 4 +- vllm/model_executor/models/qwen2.py | 4 +- 
vllm/model_executor/models/qwen2_moe.py | 4 +- vllm/model_executor/models/stablelm.py | 4 +- vllm/model_executor/models/starcoder2.py | 4 +- vllm/model_executor/models/xverse.py | 4 +- vllm/sequence.py | 66 ----- vllm/spec_decode/batch_expansion.py | 3 +- vllm/spec_decode/draft_model_runner.py | 4 +- vllm/spec_decode/medusa_worker.py | 4 +- vllm/spec_decode/mlp_speculator_worker.py | 4 +- vllm/spec_decode/multi_step_worker.py | 3 +- vllm/spec_decode/ngram_worker.py | 3 +- vllm/spec_decode/proposer_worker_base.py | 3 +- .../spec_decode/smaller_tp_proposer_worker.py | 3 +- vllm/spec_decode/spec_decode_worker.py | 3 +- vllm/spec_decode/top1_proposer.py | 4 +- vllm/spec_decode/util.py | 4 +- vllm/worker/cpu_model_runner.py | 4 +- vllm/worker/enc_dec_model_runner.py | 3 +- vllm/worker/model_runner.py | 4 +- vllm/worker/model_runner_base.py | 4 +- vllm/worker/multi_step_model_runner.py | 85 +++++- vllm/worker/multi_step_worker.py | 3 +- vllm/worker/neuron_model_runner.py | 4 +- vllm/worker/openvino_model_runner.py | 3 +- vllm/worker/openvino_worker.py | 3 +- vllm/worker/tpu_model_runner.py | 4 +- vllm/worker/worker.py | 4 +- vllm/worker/worker_base.py | 4 +- vllm/worker/xpu_model_runner.py | 4 +- 97 files changed, 468 insertions(+), 278 deletions(-) diff --git a/examples/offline_inference_multi_step.py b/examples/offline_inference_multi_step.py index 79e0fca6bccf3..643b53875e0ed 100644 --- a/examples/offline_inference_multi_step.py +++ b/examples/offline_inference_multi_step.py @@ -25,6 +25,7 @@ gpu_memory_utilization=0.9, num_scheduler_steps=8, use_v2_block_manager=True, + enforce_eager=True, ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 442e40f07f0bb..b014b4b33e1a1 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -5,8 +5,9 @@ import pytest import torch +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, Logprob, SamplerOutput +from vllm.sequence import ExecuteModelRequest, Logprob from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 9ae1b4bc40f0f..cbaffee2f41e2 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -7,8 +7,9 @@ import pytest import torch +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceOutput +from vllm.sequence import ExecuteModelRequest, SequenceOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.metrics import (AsyncMetricsCollector, SpecDecodeWorkerMetrics) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 60b36a33d9077..9075a433eb66e 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -8,12 +8,12 @@ import torch from vllm.engine.arg_utils import EngineArgs +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.utils import set_random_seed from 
vllm.sampling_params import SamplingParams from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, CompletionSequenceGroupOutput, Logprob, - SamplerOutput, SequenceData, SequenceGroupMetadata, - SequenceOutput) + SequenceData, SequenceGroupMetadata, SequenceOutput) from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 1ae349e808e0d..348ba7dd41d99 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -2,9 +2,10 @@ import pytest +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, - CompletionSequenceGroupOutput, SamplerOutput, - SequenceData, SequenceOutput) + CompletionSequenceGroupOutput, SequenceData, + SequenceOutput) from .core.utils import create_dummy_prompt diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 6385d3ca2297e..c53176d10a04e 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -25,12 +25,12 @@ from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.outputs import EmbeddingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata from vllm.usage.usage_lib import UsageContext from vllm.utils import print_warning_once diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 36cb6ce795f3e..0e4a9afff7493 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -29,6 +29,7 @@ from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MultiModalDataDict from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, RequestOutputFactory) @@ -36,9 +37,8 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest, - PoolerOutput, SamplerOutput, Sequence, - SequenceGroup, SequenceGroupMetadata, - SequenceStatus) + PoolerOutput, Sequence, SequenceGroup, + SequenceGroupMetadata, SequenceStatus) from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, init_tracer) from vllm.transformers_utils.config import try_get_generation_config diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index 57cc33d911183..76782888031e3 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -2,7 +2,8 @@ from typing import Sequence as GenericSequence from typing import Union -from vllm.sequence import PoolerOutput, SamplerOutput, SequenceGroupOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import PoolerOutput, SequenceGroupOutput def create_output_by_sequence_group( diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index e05c01fa8d6c3..33200629f5bb4 100644 --- 
a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -7,11 +7,11 @@ from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptInputs from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.outputs import EmbeddingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams -from vllm.sequence import SamplerOutput @runtime_checkable diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 37d12725bd1e4..21ad43f641685 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -11,8 +11,9 @@ ResultHandler, WorkerMonitor) from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.utils import (GiB_bytes, get_distributed_init_method, get_open_port, get_vllm_instance_id, make_async) from vllm.worker.worker_base import WorkerWrapperBase diff --git a/vllm/executor/distributed_gpu_executor.py b/vllm/executor/distributed_gpu_executor.py index 4df54a09e5e8c..e285e8f544960 100644 --- a/vllm/executor/distributed_gpu_executor.py +++ b/vllm/executor/distributed_gpu_executor.py @@ -6,7 +6,8 @@ from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest logger = init_logger(__name__) diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 422bef107f352..c96cb0f2c2981 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -6,8 +6,9 @@ PromptAdapterConfig, SchedulerConfig, SpeculativeConfig) from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest class ExecutorBase(ABC): diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 8346c3cc1d3ea..9f6867fa0409a 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -3,8 +3,9 @@ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput +from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, make_async) from vllm.worker.worker_base import WorkerBase, WorkerWrapperBase diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 08a35a074b37b..40f5229ef443b 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -14,7 +14,8 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ResultHandler, WorkerMonitor) from vllm.logger import 
init_logger -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.triton_utils import maybe_set_triton_cache_manager from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, get_distributed_init_method, get_open_port, diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index b45d5d86b54fa..ef17b35a02089 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -3,7 +3,8 @@ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.utils import make_async logger = init_logger(__name__) diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py index 867859d8d3d79..78606e223aa7b 100644 --- a/vllm/executor/openvino_executor.py +++ b/vllm/executor/openvino_executor.py @@ -9,7 +9,8 @@ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.utils import (GiB_bytes, get_distributed_init_method, get_ip, get_open_port, make_async) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index aec6998d4488d..c3f3230a0ff70 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -12,7 +12,8 @@ from vllm.executor.msgspec_utils import encode_hook from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, make_async) diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py index 7048d47980723..2a1fd35b65797 100644 --- a/vllm/executor/ray_tpu_executor.py +++ b/vllm/executor/ray_tpu_executor.py @@ -10,7 +10,8 @@ from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.executor.tpu_executor import TPUExecutor from vllm.logger import init_logger -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, make_async) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 938f83bc1338b..be4c32d9df2f4 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -14,7 +14,8 @@ from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, make_async) diff 
--git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
index 253c8abdc1ada..0af8ba41e24d5 100644
--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
@@ -5,7 +5,8 @@
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         make_async)
 
diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py
index 774204dd4612a..bada56068507a 100644
--- a/vllm/executor/xpu_executor.py
+++ b/vllm/executor/xpu_executor.py
@@ -9,7 +9,8 @@
 from vllm.executor.executor_base import ExecutorAsyncBase
 from vllm.executor.gpu_executor import GPUExecutor
 from vllm.logger import init_logger
-from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest, PoolerOutput
 from vllm.utils import make_async
 from vllm.worker.worker_base import WorkerBase
 
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index 7344d59e988f0..9bdd95a8b8aa5 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -3,11 +3,13 @@
 import warnings
 from importlib.util import find_spec
 from math import inf
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 
+import msgspec
 import torch
 import torch.nn as nn
 
+from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 from vllm.triton_utils import HAS_TRITON
 
 if HAS_TRITON:
@@ -19,8 +21,7 @@
                                                    SequenceGroupToSample)
 from vllm.sampling_params import SamplingType
 from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
-                           PromptLogprobs, SampleLogprobs, SamplerOutput,
-                           SequenceOutput)
+                           PromptLogprobs, SampleLogprobs, SequenceOutput)
 
 if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"):
     import flashinfer.sampling
@@ -35,6 +36,107 @@
 # (num_token_ids, num_parent_ids) per sequence group.
 SampleResultType = List[Tuple[List[int], List[int]]]
 
+# Types of temporary data structures used for
+# computing sample_results
+SampleMetadataType = Dict[SamplingType, Tuple[List[int],
+                                              List[SequenceGroupToSample]]]
+MultinomialSamplesType = Dict[SamplingType, torch.Tensor]
+SampleResultsDictType = Dict[int, Tuple[List[int], List[int]]]
+
+
+# Encapsulates temporary data structures for computing
+# sample_result; for multi-step scheduling: must be returned
+# by sampler.forward() and used later to compute the pythonized
+# sample_result; for single-step scheduling: consumed immediately
+# inside sampler.forward() to compute pythonized sample_result.
+class SampleResultArgsType:
+    sample_metadata: SampleMetadataType
+    sampling_metadata: SamplingMetadata
+    greedy_samples: Optional[torch.Tensor] = None
+    multinomial_samples: MultinomialSamplesType
+    beam_search_logprobs: Optional[torch.Tensor] = None
+    sample_results_dict: SampleResultsDictType
+
+
+# Union of non-deferred (single-step scheduling)
+# vs deferred (multi-step scheduling)
+# sample result types
+MaybeDeferredSampleResultType = Union[SampleResultType, SampleResultArgsType]
+
+# Shorthand for _sample()
+SampleReturnType = Tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]]
+
+
+class SamplerOutput(
+        msgspec.Struct,
+        omit_defaults=True,  # type: ignore[call-arg]
+        array_like=True):  # type: ignore[call-arg]
+    """For each sequence group, we generate a list of SequenceOutput object,
+    each of which contains one possible candidate for the next token.
+
+    This data structure implements methods, so it can be used like a list, but
+    also has optional fields for device tensors.
+    """
+
+    outputs: List[CompletionSequenceGroupOutput]
+
+    # On-device tensor containing probabilities of each token.
+    sampled_token_probs: Optional[torch.Tensor] = None
+
+    # On-device tensor containing the logprobs of each token.
+    logprobs: Optional["torch.Tensor"] = None
+
+    # Holds the pythonized sample result (single-step)
+    # or arguments for deferred pythonized sample result
+    # computation (multi-step)
+    deferred_sample_results_args: Optional[SampleResultArgsType] = None
+
+    # On-device tensor containing the sampled token ids.
+    sampled_token_ids: Optional[torch.Tensor] = None
+    # CPU tensor containing the sampled token ids. Used during multi-step to
+    # return the sampled token ids from last rank to AsyncLLMEngine to be
+    # 'broadcasted' to all other PP ranks for next step.
+    sampled_token_ids_cpu: Optional[torch.Tensor] = None
+
+    # Spec decode metrics populated by workers.
+    spec_decode_worker_metrics: Optional[SpecDecodeWorkerMetrics] = None
+
+    # Optional last hidden states from the model.
+    hidden_states: Optional[torch.Tensor] = None
+
+    # Time taken in the forward pass for this across all workers
+    model_forward_time: Optional[float] = None
+
+    # Time taken in the model execute function. This will include model forward,
+    # block/sync across workers, cpu-gpu sync time and sampling time.
+    model_execute_time: Optional[float] = None
+
+    def __getitem__(self, idx: int):
+        return self.outputs[idx]
+
+    def __setitem__(self, idx: int, value):
+        self.outputs[idx] = value
+
+    def __len__(self):
+        return len(self.outputs)
+
+    def __eq__(self, other: object):
+        return isinstance(other,
+                          self.__class__) and self.outputs == other.outputs
+
+    def __repr__(self) -> str:
+        """Show the shape of a tensor instead of its values to reduce noise.
+        """
+        sampled_token_probs_repr = ("None" if self.sampled_token_probs is None
+                                    else self.sampled_token_probs.shape)
+        sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else
+                                  self.sampled_token_ids.shape)
+        return (
+            f"SamplerOutput(outputs={self.outputs}, "
+            f"sampled_token_probs={sampled_token_probs_repr}, "
+            f"sampled_token_ids={sampled_token_ids_repr}, "
+            f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})")
+
 
 class Sampler(nn.Module):
     """Samples the next tokens from the model's outputs.
@@ -150,7 +252,7 @@ def forward(
         logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
 
         # Sample the next tokens.
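(Editor's aside, not part of the patch: a consumer of MaybeDeferredSampleResultType can distinguish the two cases with an isinstance check. get_pythonized_sample_results is the helper added later in this patch; resolve_sample_results is a hypothetical name used only for illustration.)

def resolve_sample_results(
        maybe_deferred: MaybeDeferredSampleResultType) -> SampleResultType:
    if isinstance(maybe_deferred, SampleResultArgsType):
        # Multi-step scheduling: pythonization was deferred, so the
        # GPU->CPU conversion has to happen now.
        return get_pythonized_sample_results(maybe_deferred)
    # Single-step scheduling: the results are already pythonized lists.
    return maybe_deferred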
- sample_results, maybe_sampled_tokens_tensor = _sample( + maybe_deferred_sample_results, maybe_sampled_tokens_tensor = _sample( probs, logprobs, sampling_metadata, @@ -169,11 +271,13 @@ def forward( prompt_logprobs = None sample_logprobs = None if not sampling_metadata.skip_sampler_cpu_output: - prompt_logprobs, sample_logprobs = _get_logprobs( - logprobs, sampling_metadata, sample_results) + assert not isinstance(maybe_deferred_sample_results, + SampleResultArgsType) + prompt_logprobs, sample_logprobs = get_logprobs( + logprobs, sampling_metadata, maybe_deferred_sample_results) return _build_sampler_output( - sample_results, + maybe_deferred_sample_results, sampling_metadata, prompt_logprobs, sample_logprobs, @@ -543,6 +647,63 @@ def _top_k_top_p_multinomial_with_flashinfer( return batch_next_token_ids.view(-1, num_samples) +def get_pythonized_sample_results( + sample_result_args: SampleResultArgsType, ) -> SampleResultType: + ''' + This function consumes GPU-side sampler results and computes + Pythonized CPU-side sampler results (GPU -> CPU sync.) + + Single-step scheduling: this function is invoked at sampling-time + for immediate Pythonization. + + Multi-step scheduling: Pythonization is deferred until after multiple + GPU-side steps have been completed. + + Arguments: + + * sample_result_args: GPU-side inputs to the Pythonization process + + Returns: + + * Pythonized sampler results + ''' + + ( + sample_metadata, + sampling_metadata, + greedy_samples, + multinomial_samples, + beam_search_logprobs, + sample_results_dict, + ) = ( + sample_result_args.sample_metadata, + sample_result_args.sampling_metadata, + sample_result_args.greedy_samples, + sample_result_args.multinomial_samples, + sample_result_args.beam_search_logprobs, + sample_result_args.sample_results_dict, + ) + + for sampling_type in SamplingType: + if sampling_type not in sample_metadata: + continue + (seq_group_id, seq_groups) = sample_metadata[sampling_type] + if sampling_type == SamplingType.GREEDY: + sample_results = _greedy_sample(seq_groups, greedy_samples) + elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): + sample_results = _random_sample(seq_groups, + multinomial_samples[sampling_type]) + elif sampling_type == SamplingType.BEAM: + sample_results = _beam_search_sample(seq_groups, + beam_search_logprobs) + sample_results_dict.update(zip(seq_group_id, sample_results)) + + return [ + sample_results_dict.get(i, ([], [])) + for i in range(len(sampling_metadata.seq_groups)) + ] + + def _sample_with_torch( probs: torch.Tensor, logprobs: torch.Tensor, @@ -550,7 +711,16 @@ def _sample_with_torch( sampling_tensors: SamplingTensors, include_gpu_probs_tensor: bool, modify_greedy_probs: bool, -) -> Tuple[SampleResultType, Optional[torch.Tensor]]: +) -> SampleReturnType: + ''' + Torch-oriented _sample() implementation. + ''' + + # Begin building arguments for computing Pythonized sampler + # results. 
+ maybe_deferred_args = SampleResultArgsType() + maybe_deferred_args.sampling_metadata = sampling_metadata + categorized_seq_group_ids: Dict[SamplingType, List[int]] = {t: [] for t in SamplingType} @@ -560,10 +730,9 @@ def _sample_with_torch( sampling_type = sampling_params.sampling_type categorized_seq_group_ids[sampling_type].append(i) - sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata: Dict[SamplingType, - Tuple[List[int], List[SequenceGroupToSample]]] = {} - multinomial_samples: Dict[SamplingType, torch.Tensor] = {} + sample_results_dict: SampleResultsDictType = {} + sample_metadata: SampleMetadataType = {} + multinomial_samples: MultinomialSamplesType = {} # Create output tensor for sampled token ids. if include_gpu_probs_tensor: @@ -589,6 +758,7 @@ def _sample_with_torch( if sampling_type == SamplingType.GREEDY: greedy_samples = torch.argmax(logprobs[long_sample_indices], dim=-1) + maybe_deferred_args.greedy_samples = greedy_samples if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. @@ -635,35 +805,27 @@ def _sample_with_torch( elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] + maybe_deferred_args.beam_search_logprobs = beam_search_logprobs else: raise ValueError(f"Unsupported sampling type: {sampling_type}") - # GPU<->CPU sync happens in the loop below. - # This also converts the sample output to Python objects. + maybe_deferred_args.sample_metadata = sample_metadata + maybe_deferred_args.multinomial_samples = multinomial_samples + maybe_deferred_args.sample_results_dict = sample_results_dict + if not sampling_metadata.skip_sampler_cpu_output: - for sampling_type in SamplingType: - if sampling_type not in sample_metadata: - continue - (seq_group_id, seq_groups) = sample_metadata[sampling_type] - if sampling_type == SamplingType.GREEDY: - sample_results = _greedy_sample(seq_groups, greedy_samples) - elif sampling_type in (SamplingType.RANDOM, - SamplingType.RANDOM_SEED): - sample_results = _random_sample( - seq_groups, multinomial_samples[sampling_type]) - elif sampling_type == SamplingType.BEAM: - sample_results = _beam_search_sample(seq_groups, - beam_search_logprobs) - sample_results_dict.update(zip(seq_group_id, sample_results)) - - sample_results = [ - sample_results_dict.get(i, ([], [])) - for i in range(len(sampling_metadata.seq_groups)) - ] + # GPU<->CPU sync happens here. + # This also converts the sample output to Python objects. 
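(Editor's aside, not part of the patch: when skip_sampler_cpu_output is set, the deferred args built above travel on SamplerOutput.deferred_sample_results_args and are resolved later by the multi-step runner. A rough sketch of that late pythonization step, assuming the surrounding sampler.py imports; pythonize_deferred and logprobs_gpu are hypothetical names for illustration only:)

def pythonize_deferred(sampler_output: SamplerOutput,
                       logprobs_gpu: torch.Tensor):
    # Deferred path (multi-step scheduling): the GPU->CPU sync that was
    # skipped at sampling time happens here instead.
    args = sampler_output.deferred_sample_results_args
    assert args is not None
    sample_results = get_pythonized_sample_results(args)
    # Logprobs were also skipped earlier; compute them now with the
    # newly exported get_logprobs() helper.
    prompt_logprobs, sample_logprobs = get_logprobs(
        logprobs_gpu, args.sampling_metadata, sample_results)
    return sample_results, prompt_logprobs, sample_logprobs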
+ # Return Pythonized sampler result & sampled token ids + return get_pythonized_sample_results( + maybe_deferred_args, ), sampled_token_ids_tensor else: - sample_results = [] - - return sample_results, sampled_token_ids_tensor + # Defer sampler result Pythonization; return deferred + # Pythonization args & sampled token ids + return ( + maybe_deferred_args, + sampled_token_ids_tensor, + ) def _sample_with_triton_kernel( @@ -755,7 +917,7 @@ def _sample( sampling_tensors: SamplingTensors, include_gpu_probs_tensor: bool, modify_greedy_probs: bool, -) -> Tuple[SampleResultType, Optional[torch.Tensor]]: +) -> SampleReturnType: """ Args: probs: (num_query_tokens_in_batch, num_vocab) @@ -803,7 +965,7 @@ def _get_ranks(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: return result.sum(1).add_(1) -def _get_logprobs( +def get_logprobs( logprobs: torch.Tensor, sampling_metadata: SamplingMetadata, sample_results: SampleResultType, @@ -1126,7 +1288,7 @@ def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, def _build_sampler_output( - sample_results: SampleResultType, + maybe_deferred_sample_results: MaybeDeferredSampleResultType, sampling_metadata: SamplingMetadata, prompt_logprobs: Optional[List[Optional[PromptLogprobs]]], sample_logprobs: Optional[List[SampleLogprobs]], @@ -1143,14 +1305,21 @@ def _build_sampler_output( speculative decoding rejection sampling. """ sampler_output: List[CompletionSequenceGroupOutput] = [] - if not skip_sampler_cpu_output: + + if skip_sampler_cpu_output: + assert isinstance(maybe_deferred_sample_results, SampleResultArgsType) + deferred_sample_results_args = maybe_deferred_sample_results + else: assert prompt_logprobs is not None assert sample_logprobs is not None + assert not isinstance(maybe_deferred_sample_results, + SampleResultArgsType) + deferred_sample_results_args = None for (seq_group, sample_result, group_prompt_logprobs, group_sample_logprobs) in zip(sampling_metadata.seq_groups, - sample_results, prompt_logprobs, - sample_logprobs): + maybe_deferred_sample_results, + prompt_logprobs, sample_logprobs): seq_ids = seq_group.seq_ids next_token_ids, parent_ids = sample_result seq_outputs: List[SequenceOutput] = [] @@ -1176,7 +1345,7 @@ def _build_sampler_output( sampled_token_probs=sampled_token_probs, sampled_token_ids=sampled_token_ids, logprobs=logprobs_tensor, - ) + deferred_sample_results_args=deferred_sample_results_args) def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]: diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index 07e23aca6cc5f..750ba188487b4 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -10,9 +10,8 @@ from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplerOutput TORCH_DTYPE_TO_NEURON_AMP = { "auto": "f32", diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index 5c522a61732a4..3c1f6fa769894 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -15,9 +15,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import 
(LogitsProcessor, _prune_hidden_states) -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplerOutput logger = init_logger(__name__) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 28f69cfbc46bd..efa044d0b5e92 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -23,13 +23,13 @@ from vllm.model_executor.layers.quantization.deepspeedfp import ( DeepSpeedFPConfig, DeepSpeedFPParameter) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.arctic import ArcticConfig logger = init_logger(__name__) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 73711d8eb5185..bdd76b11384c2 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -38,12 +38,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index f78400b0df7b3..9b4c4be7fcb09 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -34,12 +34,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors logger = logging.get_logger(__name__) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 20dda2a67820d..6c3e4d324000e 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -13,13 +13,13 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.logits_processor import 
LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.opt import OPTModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SamplerOutput, SequenceData) + SequenceData) from .blip import (BlipVisionModel, dummy_image_for_blip, get_max_blip_image_tokens) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 07ee0e3c531d0..831b3f20457a9 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -34,12 +34,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 788d22db9d5a8..6d2afcf18e396 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -22,7 +22,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -33,7 +33,7 @@ from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SamplerOutput, SequenceData) + SequenceData) from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 4949d0232fabb..35f1ed5ef5d33 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -20,12 +20,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from 
vllm.transformers_utils.configs import ChatGLMConfig from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index f63cf246e510a..be7f19d15b623 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -38,14 +38,14 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, row_parallel_weight_loader) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors @torch.compile diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index dca959798e8b2..6160197dc19de 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -17,13 +17,13 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.dbrx import DbrxConfig diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 7a27e1388e987..61cc917ab6207 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -43,12 +43,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class DeepseekMLP(nn.Module): diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index c7f3af0ccb266..8cbd9435ec7ca 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -43,12 +43,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from 
vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 7b97b3d255dfa..b474d35baf89d 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -39,12 +39,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import RWConfig FalconConfig = Union[HF_FalconConfig, RWConfig] diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 2ef23819b69a2..6c0cce3aa555d 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -31,6 +31,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -39,7 +40,7 @@ from vllm.multimodal.image import (cached_get_image_processor, cached_get_tokenizer) from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SamplerOutput, SequenceData) + SequenceData) from .interfaces import SupportsMultiModal from .utils import merge_multimodal_embeddings diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index e1041edf81b0a..36fd389831282 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -34,12 +34,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 5e0f8b70d4b80..90449ec51ef0b 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -33,12 +33,12 @@ from 
vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index bfc231282952a..fb5a297661ddc 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -34,12 +34,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .utils import is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index b93fb8d69b2d7..fe5ec10827608 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -34,12 +34,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 4d52b448049b4..664d775c8ba40 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -33,12 +33,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class GPTJAttention(nn.Module): diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 
2adecf7fa9ef8..5f6f1e3880547 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -33,12 +33,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class GPTNeoXAttention(nn.Module): diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 887a353df972c..dd396648f1bdf 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -17,12 +17,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class InternLM2MLP(nn.Module): diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index b379c86c1912b..33829aef1dc50 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -18,13 +18,14 @@ from vllm.config import CacheConfig, MultiModalConfig from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.intern_vit import InternVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.image import cached_get_tokenizer -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, get_clip_num_patches) diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index a550f7e6c97a1..b0fbb7e9829e0 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -35,12 +35,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import 
SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import JAISConfig from .utils import is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index b82eb14fb5f23..dc8912c87f56a 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -26,14 +26,14 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import HasInnerState from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, _get_graph_batch_size) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 0c67a9b8e198b..e55c01316087c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -42,13 +42,13 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.utils import is_hip from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6433ea380cbfe..9e0159ba1f775 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -11,10 +11,11 @@ from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .clip import (CLIPVisionModel, dummy_image_for_clip, dummy_seq_data_for_clip, get_max_clip_image_tokens, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index c7cb243fa84da..b7c9a81e10894 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -15,10 
+15,11 @@ from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .clip import (CLIPVisionModel, dummy_image_for_clip, dummy_seq_data_for_clip, get_clip_image_feature_size, diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index c2a61ca52011e..183b5704b534e 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -4,11 +4,11 @@ import torch.nn as nn from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.medusa import MedusaConfig diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index ff42bdefe0269..a135118bc748e 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -44,13 +44,13 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 99a3c5dab39e4..8dfa0bb7792c5 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -44,7 +44,7 @@ from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -57,7 +57,7 @@ from vllm.multimodal.image import (cached_get_image_processor, cached_get_tokenizer) from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SamplerOutput, SequenceData) + SequenceData) from .idefics2_vision_model import Idefics2VisionTransformer diff --git 
a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 34f581ac78582..207f4cdd4ec6b 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -39,13 +39,13 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 8bdd52b343175..68471f6ac77d1 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -42,12 +42,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class MixtralMLP(nn.Module): diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 9b96ecb78a3c9..42ccd01298169 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -6,11 +6,10 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import MLPSpeculatorConfig SQRT2 = 2**0.5 diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 1a8e514a7ae83..0fcbf06e1a060 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -17,12 +17,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import 
IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.mpt import MPTConfig diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 7d92a1ffe55df..e9ff12de2094e 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -37,13 +37,13 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import NemotronConfig from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 8de124cd034dc..97749725dd132 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -38,12 +38,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class OlmoAttention(nn.Module): diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index c0d2d537e731f..88d2bcb9f0c9d 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -34,12 +34,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class OPTLearnedPositionalEmbedding(nn.Embedding): diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index fab35f0b882a7..b01ce87adfa46 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -21,12 +21,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from 
vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class OrionMLP(nn.Module): diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 8beb2778fe37a..9fbc9bbf47a05 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -11,13 +11,13 @@ from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.gemma import GemmaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_tokenizer -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal from .siglip import (SiglipVisionModel, dummy_image_for_siglip, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 3300939c7b102..f8fc1cd8ef1f0 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -37,12 +37,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class PersimmonMLP(nn.Module): diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index f31b5162aac96..15c21cfa2d8a8 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -52,12 +52,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index df01bfa3d8e6e..afc6fe9844ad6 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -16,12 +16,12 @@ from 
vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors def load_column_parallel_weight(param: torch.nn.Parameter, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 328f4e6fa827c..8ae502d27eb85 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -30,7 +30,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.clip import CLIPVisionModel @@ -38,7 +38,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_tokenizer -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, input_processor_for_clip) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index b7d017d5f3ea6..8298e3bac4465 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -22,12 +22,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once from .utils import is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index b95987c16ebca..a64e08c422bc3 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -40,13 +40,13 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, 
maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 6f838947fbf27..56129515ca8d1 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -45,12 +45,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once from .utils import is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index decbf89d27c7c..6236426dcd4e1 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -36,12 +36,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class StablelmMLP(nn.Module): diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index d1b1d210b727c..d3a3a83c8437f 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -35,12 +35,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors class Starcoder2Attention(nn.Module): diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index c0bafa9367e43..24cc3728f85e4 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -38,12 +38,12 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from 
vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA diff --git a/vllm/sequence.py b/vllm/sequence.py index 206da192193dc..f70d1c46c2a9c 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1060,72 +1060,6 @@ def __repr__(self) -> str: return f"IntermediateTensors(tensors={self.tensors})" -class SamplerOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """For each sequence group, we generate a list of SequenceOutput object, - each of which contains one possible candidate for the next token. - - This data structure implements methods, so it can be used like a list, but - also has optional fields for device tensors. - """ - - outputs: List[CompletionSequenceGroupOutput] - - # On-device tensor containing probabilities of each token. - sampled_token_probs: Optional[torch.Tensor] = None - - # On-device tensor containing the logprobs of each token. - logprobs: Optional["torch.Tensor"] = None - - # On-device tensor containing the sampled token ids. - sampled_token_ids: Optional[torch.Tensor] = None - # CPU tensor containing the sampled token ids. Used during multi-step to - # return the sampled token ids from last rank to AsyncLLMEngine to be - # 'broadcasted' to all other PP ranks for next step. - sampled_token_ids_cpu: Optional[torch.Tensor] = None - - # Spec decode metrics populated by workers. - spec_decode_worker_metrics: Optional[SpecDecodeWorkerMetrics] = None - - # Optional last hidden states from the model. - hidden_states: Optional[torch.Tensor] = None - - # Time taken in the forward pass for this across all workers - model_forward_time: Optional[float] = None - - # Time taken in the model execute function. This will include model forward, - # block/sync across workers, cpu-gpu sync time and sampling time. - model_execute_time: Optional[float] = None - - def __getitem__(self, idx: int): - return self.outputs[idx] - - def __setitem__(self, idx: int, value): - self.outputs[idx] = value - - def __len__(self): - return len(self.outputs) - - def __eq__(self, other: object): - return isinstance(other, - self.__class__) and self.outputs == other.outputs - - def __repr__(self) -> str: - """Show the shape of a tensor instead of its values to reduce noise. 
- """ - sampled_token_probs_repr = ("None" if self.sampled_token_probs is None - else self.sampled_token_probs.shape) - sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else - self.sampled_token_ids.shape) - return ( - f"SamplerOutput(outputs={self.outputs}, " - f"sampled_token_probs={sampled_token_probs_repr}, " - f"sampled_token_ids={sampled_token_ids_repr}, " - f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") - - class PoolerOutput( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index ad6f3f313841d..2785d30047240 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -5,8 +5,9 @@ import torch from vllm import SamplingParams +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, ExecuteModelRequest, - SamplerOutput, SequenceData, SequenceGroupMetadata, + SequenceData, SequenceGroupMetadata, get_all_seq_ids) from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 053e9203e01eb..b89561fc8f656 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -3,6 +3,7 @@ import torch from vllm import _custom_ops as ops +from vllm.model_executor.layers.sampler import SamplerOutput try: from vllm.attention.backends.flash_attn import FlashAttentionMetadata @@ -16,8 +17,7 @@ PromptAdapterConfig, SchedulerConfig) from vllm.logger import init_logger from vllm.multimodal import MultiModalInputs -from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, - SamplerOutput) +from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, ModelRunner) diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py index d1809e49c2a8f..0d233f393cb8c 100644 --- a/vllm/spec_decode/medusa_worker.py +++ b/vllm/spec_decode/medusa_worker.py @@ -4,8 +4,8 @@ import torch from vllm.model_executor import SamplingMetadata -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py index 76e444387816f..fc41bb82ea340 100644 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -3,8 +3,8 @@ import torch from vllm.model_executor import SamplingMetadata -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 65bfb5dc8d5c6..63c6db6573e6d 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ 
b/vllm/spec_decode/multi_step_worker.py @@ -4,7 +4,8 @@ import torch -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import (ExecuteModelRequest, SequenceData, SequenceGroupMetadata) from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner from vllm.spec_decode.interfaces import (SpeculativeProposals, diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 806480b5c892f..36e5e1774aa0d 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -3,7 +3,8 @@ import torch -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index efb8ee25ba2f9..28a537593f26d 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod from typing import List, Optional, Set, Tuple -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.interfaces import SpeculativeProposer from vllm.worker.worker_base import LoraNotSupportedWorkerBase diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 215ede52fb812..8896b7dbc6b8a 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -6,7 +6,8 @@ init_model_parallel_group, patch_tensor_parallel_group) from vllm.logger import init_logger -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index acf77a7349eef..2d1925d04a765 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -8,12 +8,13 @@ from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.spec_decode_base_sampler import ( SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler) from vllm.model_executor.layers.typical_acceptance_sampler import ( TypicalAcceptanceSampler) from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest, - HiddenStates, SamplerOutput, SequenceGroupMetadata, + HiddenStates, SequenceGroupMetadata, get_all_seq_ids, get_all_seq_ids_and_request_ids) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 28f7f7eb069ab..91cc7791b482a 
100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -2,8 +2,8 @@ import torch -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index b85f2a6f70ac0..8606337bc65a2 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -4,9 +4,9 @@ import torch +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SamplerOutput, SequenceGroupMetadata, - SequenceOutput) + SequenceGroupMetadata, SequenceOutput) SeqId = int diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 82e69b569d90d..e6aaba5958722 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -10,11 +10,11 @@ SchedulerConfig) from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalInputs) -from vllm.sequence import (IntermediateTensors, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 5c700229660c0..d6189d82d51d9 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -16,9 +16,10 @@ from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.sampling_params import SamplingParams -from vllm.sequence import (IntermediateTensors, PoolerOutput, SamplerOutput, +from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceGroupMetadata) from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad from vllm.worker.model_runner import (GPUModelRunnerBase, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 793f03456e997..31c39503dfe3d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -28,6 +28,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata, SamplingMetadataCache +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.models.interfaces import (supports_lora, @@ -40,8 +41,7 @@ from vllm.prompt_adapter.worker_manager import ( LRUCacheWorkerPromptAdapterManager) from vllm.sampling_params import SamplingParams -from vllm.sequence import (IntermediateTensors, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import 
(CudaMemoryProfiler, PyObjectCache, async_tensor_h2d, flatten_2d_lists, is_hip, is_pin_memory_available) from vllm.worker.model_runner_base import ( diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 90c39407d7266..f8fd9d801d289 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -5,9 +5,9 @@ import torch +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.platforms import current_platform -from vllm.sequence import (IntermediateTensors, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import IntermediateTensors, SequenceGroupMetadata if TYPE_CHECKING: from vllm.attention import AttentionMetadata diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index de3de95e16881..8ec660d30ee3f 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union try: from vllm.attention.backends.flash_attn import FlashAttentionMetadata @@ -10,13 +10,15 @@ import torch -from vllm.model_executor.layers.sampler import _get_logprobs from vllm import _custom_ops as ops from vllm.distributed import get_pp_group from vllm.logger import init_logger +from vllm.model_executor.layers.sampler import (PromptLogprobs, SampleLogprobs, + SamplerOutput, + SamplingMetadata, get_logprobs, + get_pythonized_sample_results) from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SamplerOutput, SequenceGroupMetadata, - SequenceOutput) + SequenceGroupMetadata, SequenceOutput) from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPUWithSamplingMetadata) from vllm.worker.model_runner_base import ( @@ -296,7 +298,7 @@ def execute_model( model_input.cached_outputs.append( ModelOutput(output[0], output_ready_event, output[0].sampled_token_ids, False)) - + # Pythonize the output if CPU is ahead and the previous step is # ready. 
for model_output in model_input.cached_outputs: @@ -412,6 +414,38 @@ def vocab_size(self) -> int: return self._base_model_runner.vocab_size +DeferredLogprobsReturnType = Tuple[Optional[List[Optional[PromptLogprobs]]], + Optional[List[SampleLogprobs]]] + + +def _maybe_deferred_pythonize_logprobs( + skip_sampler_cpu_output: bool, output: SamplerOutput, + sampling_metadata: SamplingMetadata) -> DeferredLogprobsReturnType: + if skip_sampler_cpu_output: + # Perform deferred logprob Pythonization + + # - Deferred pythonized sample result computation + sample_result = get_pythonized_sample_results( + output.deferred_sample_results_args) + + # - Erase the CUDA-side deferred sample_result + # computation args + output.deferred_sample_results_args = None + + # - Compute logprobs + ( + prompt_logprobs, + sample_logprobs, + ) = get_logprobs(output.logprobs, sampling_metadata, sample_result) + + assert len(prompt_logprobs) == len(sampling_metadata.seq_groups) + assert len(sample_logprobs) == len(sampling_metadata.seq_groups) + + return prompt_logprobs, sample_logprobs + + return None, None + + def _pythonize_sampler_output(model_input: StatefulModelInput, output: SamplerOutput, pinned_sampled_token_buffer: torch.Tensor, @@ -437,8 +471,33 @@ def _pythonize_sampler_output(model_input: StatefulModelInput, sampling_metadata = frozen_model_input.sampling_metadata - for (seq_group, sample_result) in zip(sampling_metadata.seq_groups, - samples_list): + skip_sampler_cpu_output = ( + frozen_model_input.sampling_metadata.skip_sampler_cpu_output) + + ( + prompt_logprobs, + sample_logprobs, + ) = _maybe_deferred_pythonize_logprobs(skip_sampler_cpu_output, output, + sampling_metadata) + + for sgdx, (seq_group, sample_result) in enumerate( + zip(sampling_metadata.seq_groups, samples_list)): + + if skip_sampler_cpu_output: + assert prompt_logprobs is not None + assert sample_logprobs is not None + + ( + group_prompt_logprobs, + group_sample_logprobs, + ) = ( # Utilize deferred pythonization results + prompt_logprobs[sgdx], + sample_logprobs[sgdx], + ) if skip_sampler_cpu_output else ( + # profile_run: use already-computed logprobs + output.outputs[sgdx].prompt_logprobs, + [sample.logprobs for sample in output.outputs[sgdx].samples]) + seq_ids = seq_group.seq_ids next_token_ids = sample_result parent_ids = [0] @@ -446,11 +505,11 @@ def _pythonize_sampler_output(model_input: StatefulModelInput, if seq_group.sampling_params.logits_processors: assert len(seq_group.sampling_params.logits_processors) == 0, ( "Logits Processors are not supported in multi-step decoding") - for parent_id, next_token_id in zip(parent_ids, next_token_ids): - # TODO(will): support logprobs - # Hard coded logprob + for (parent_id, next_token_id, + logprobs) in zip(parent_ids, next_token_ids, + group_sample_logprobs): seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, - {next_token_id: Logprob(logprob=-1)})) - output.outputs.append(CompletionSequenceGroupOutput(seq_outputs, None)) + SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) + output.outputs.append( + CompletionSequenceGroupOutput(seq_outputs, group_prompt_logprobs)) assert len(output.outputs) > 0 diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index 6a6caba9371eb..08535ef2142cf 100644 --- a/vllm/worker/multi_step_worker.py +++ b/vllm/worker/multi_step_worker.py @@ -2,7 +2,8 @@ from typing import List, Optional, Tuple from vllm.distributed import broadcast_tensor_dict, get_pp_group -from vllm.sequence import 
ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.worker.model_runner_base import BroadcastableModelInput from vllm.worker.multi_step_model_runner import (MultiStepModelRunner, StatefulModelInput) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 4f3fed2dbd723..f3defffdfa520 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -8,11 +8,11 @@ SchedulerConfig) from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuron import get_neuron_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalInputs) -from vllm.sequence import (IntermediateTensors, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index a1d09a2f9e53e..f335e4e32efd4 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -11,10 +11,11 @@ SchedulerConfig) from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.openvino import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalInputs) -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import SequenceGroupMetadata logger = init_logger(__name__) diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index c47f9acc4423d..36339e175d7bb 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -14,7 +14,8 @@ init_distributed_environment) from vllm.logger import init_logger from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.worker.openvino_model_runner import OpenVINOModelRunner from vllm.worker.worker_base import LoraNotSupportedWorkerBase diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 01daa64b5a32f..72cdc591ed2e8 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -13,11 +13,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.logger import init_logger +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SamplerOutput, SequenceGroupMetadata, - SequenceOutput) + Logprob, SequenceGroupMetadata, SequenceOutput) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 97be68934be46..a184093b0df35 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -15,12 +15,12 @@ set_custom_all_reduce) from 
vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, - SamplerOutput, SequenceGroupMetadata, - SequenceGroupMetadataDelta) + SequenceGroupMetadata, SequenceGroupMetadataDelta) from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 9fddc863548eb..5a34619ee44ca 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -11,9 +11,9 @@ from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.platforms import current_platform -from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, - SamplerOutput) +from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, update_environment_variables) from vllm.worker.model_runner_base import (BroadcastableModelInput, diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 0bfc57a1c57de..e9d5683f7cea2 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -11,12 +11,12 @@ from vllm.distributed import broadcast_tensor_dict from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalInputs, MultiModalRegistry) from vllm.sampling_params import SamplingParams -from vllm.sequence import (IntermediateTensors, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata from vllm.worker.model_runner_base import ( From ed972885deab1d506dc5a9f790a3520e945a9be5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 14:52:58 -0400 Subject: [PATCH 04/47] passing test; dataclass --- examples/offline_inference_multi_step.py | 18 ++- tests/multi_step/test_correctness.py | 144 +++++++++++++++++++++-- tests/utils.py | 2 +- vllm/model_executor/layers/sampler.py | 34 +++--- vllm/worker/multi_step_model_runner.py | 73 ++++++++---- 5 files changed, 209 insertions(+), 62 deletions(-) diff --git a/examples/offline_inference_multi_step.py b/examples/offline_inference_multi_step.py index 643b53875e0ed..5c603a8eca2a4 100644 --- a/examples/offline_inference_multi_step.py +++ b/examples/offline_inference_multi_step.py @@ -15,18 +15,16 @@ "The future of AI is", ] # Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +sampling_params = SamplingParams(temperature=0.8, top_p=0.95, logprobs=5) # Create an LLM. 
-llm = LLM( - model="JackFram/llama-160m", - swap_space=16, - tensor_parallel_size=1, - gpu_memory_utilization=0.9, - num_scheduler_steps=8, - use_v2_block_manager=True, - enforce_eager=True, -) +llm = LLM(model="JackFram/llama-160m", + swap_space=16, + tensor_parallel_size=1, + gpu_memory_utilization=0.9, + num_scheduler_steps=8, + use_v2_block_manager=True, + enforce_eager=True) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness.py index bc14311c66424..79b7f95650a20 100644 --- a/tests/multi_step/test_correctness.py +++ b/tests/multi_step/test_correctness.py @@ -1,8 +1,9 @@ # Test the AsyncLLMEngine with multi-step-decoding -from typing import List +from typing import Dict, List, Optional import pytest +from openai.types.completion import Completion from ..utils import RemoteOpenAIServer @@ -22,9 +23,29 @@ "16", ] +NUM_LOGPROBS = [None, 5] # `logprobs` argument to OpenAI completions API -async def completions_with_server_args(prompts: List[str], model_name: str, - server_cli_args: List[str]): + +async def completions_with_server_args( + prompts: List[str], + model_name: str, + server_cli_args: List[str], + num_logprobs: Optional[int], +) -> Completion: + ''' + Construct a remote OpenAI server, obtain an async client to the + server & invoke the completions API to obtain completions. + + Arguments: + + * prompts: test prompts + * model_name: model to spin up on the vLLM server + * server_cli_args: CLI args for starting the server + + Returns: + + * OpenAI Completion instance + ''' outputs = None with RemoteOpenAIServer(model_name, server_cli_args) as server: @@ -33,12 +54,78 @@ async def completions_with_server_args(prompts: List[str], model_name: str, prompt=prompts, temperature=0, stream=False, - max_tokens=5) + max_tokens=5, + logprobs=num_logprobs) assert outputs is not None return outputs +def get_text_generations(completions: Completion): + '''Obtain generated tokens''' + return [x.text for x in completions.choices] + + +''' +Logprobs values are extracted as List[List[Dict[str,float]]], i.e.: +* For each `SequenceGroup`, +* for each token offset in a sequence, +* a mapping from str(token) -> logprob +''' +LogprobType = List[List[Dict[str, float]]] + + +def get_logprob_generations(completions: Completion) -> LogprobType: + '''Obtain top-rank logprobs for each token in each `SequenceGroup`''' + return [x.logprobs.top_logprobs for x in completions.choices] + + +def assert_all_close_logprobs( + ref_logprobs: LogprobType, + test_logprobs: LogprobType, + atol: float = 1e-3, + rtol: float = 1e-3, +) -> None: + ''' + Asserts that logprobs produced by the vLLM engine instance under test + are very close to a set of ground-truth reference values. 
+ + Each individual reference logprob must be close to the test logprob, + according to the formula + + assert abs(tok_top_test_logprob - + tok_top_ref_logprob) <= (atol + + rtol * abs( + tok_top_ref_logprob)) + + Arguments: + + * ref_logprobs: ground-truth logprobs + * test_logprobs: logprobs produced by vLLM engine under test + * atol: absolute mismatch tolerance when comparing single logprobs + * rtol: relative mismatch tolerance when comparing single logprobs + ''' + + assert len(ref_logprobs) == len(test_logprobs), ( + "Reference & test logprob SequenceGroup counts must match.") + + for (group_ref_logprobs, + group_test_logprobs) in zip(ref_logprobs, test_logprobs): + assert len(group_ref_logprobs) == len(group_test_logprobs), ( + "Reference & test logprob seq lens must match.") + for (token_ref_logprobs, + token_test_logprobs) in zip(group_ref_logprobs, + group_test_logprobs): + assert token_ref_logprobs.keys() == token_test_logprobs.keys(), ( + "Reference & test top-logprob token sets must match.") + for (tok_str_ref, + tok_top_ref_logprob) in token_ref_logprobs.items(): + tok_top_test_logprob = token_test_logprobs[tok_str_ref] + + assert abs(tok_top_test_logprob - tok_top_ref_logprob) <= ( + atol + rtol * abs(tok_top_ref_logprob)) + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize(("tp_size, pp_size"), [ (1, 1), @@ -47,10 +134,37 @@ async def completions_with_server_args(prompts: List[str], model_name: str, @pytest.mark.parametrize("eager_mode", [False, True]) @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @pytest.mark.parametrize("num_prompts", NUM_PROMPTS) +@pytest.mark.parametrize("num_logprobs", NUM_LOGPROBS) @pytest.mark.asyncio async def test_multi_step(example_prompts, model: str, tp_size: int, pp_size: int, eager_mode: int, - num_scheduler_steps: int, num_prompts: int): + num_scheduler_steps: int, num_prompts: int, + num_logprobs: Optional[int]): + ''' + Test vLLM engine with multi-step scheduling in an OpenAI-protocol + client/server environment. + + Set up an engine with single-step scheduling as a ground-truth reference. + + Send a completions API request to both engines with the same prompts. + + Validate: + * Generated tokens match + * Generated logprobs are all very close + + Arguments: + + * example_prompts: test fixture providing example prompts + * model: model under test (same for single- and multi-step engines) + * tp_size: degree of tensor-parallelism + * pp_size: degree of pipeline-parallelism + * eager_mode + * num_scheduler_steps: for multi-step scheduling, GPU-side steps per + GPU -> CPU output transfer + * num_prompts: number of example prompts under test + * num_logprobs: corresponds to the `logprobs` argument to the OpenAI + completions endpoint; `None` -> no logprobs + ''' prompts = example_prompts if len(prompts) < num_prompts: @@ -73,13 +187,23 @@ async def test_multi_step(example_prompts, model: str, tp_size: int, ] ref_completions = await completions_with_server_args( - prompts, model, server_args + distributed_args) + prompts, model, server_args + distributed_args, num_logprobs) test_completions = await completions_with_server_args( - prompts, model, ms_server_args + distributed_args) - - def get_text_generations(completions): - return [x.text for x in completions.choices] + prompts, model, ms_server_args + distributed_args, num_logprobs) + # Assert multi-step scheduling produces identical tokens + # to single-step scheduling. 
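# Illustrative sketch only (hypothetical token strings and values, not taken
# from this patch): the per-token closeness rule enforced by
# assert_all_close_logprobs above is
#     abs(test_logprob - ref_logprob) <= atol + rtol * abs(ref_logprob)
atol, rtol = 1e-5, 1e-5
ref_token_logprobs = {"Paris": -0.01, " the": -4.73}
test_token_logprobs = {"Paris": -0.010004, " the": -4.730002}
for tok, ref_lp in ref_token_logprobs.items():
    test_lp = test_token_logprobs[tok]
    assert abs(test_lp - ref_lp) <= atol + rtol * abs(ref_lp)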
ref_generations = get_text_generations(ref_completions) test_generations = get_text_generations(test_completions) assert ref_generations == test_generations + + # Assert multi-step scheduling produces identical logprobs + # to single-step scheduling. + ref_logprobs = get_logprob_generations(ref_completions) + test_logprobs = get_logprob_generations(test_completions) + assert_all_close_logprobs( + ref_logprobs, + test_logprobs, + atol=1e-5, + rtol=1e-5, + ) diff --git a/tests/utils.py b/tests/utils.py index 3e0124fa11352..96ce10c669f7f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -133,7 +133,7 @@ def get_client(self): api_key=self.DUMMY_API_KEY, ) - def get_async_client(self): + def get_async_client(self) -> openai.AsyncOpenAI: return openai.AsyncOpenAI( base_url=self.url_for("v1"), api_key=self.DUMMY_API_KEY, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 9bdd95a8b8aa5..4ed855176a46f 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -15,6 +15,8 @@ if HAS_TRITON: from vllm.model_executor.layers.ops.sample import sample as sample_triton +from dataclasses import dataclass + import vllm.envs as envs from vllm.model_executor.sampling_metadata import (SamplingMetadata, SamplingTensors, @@ -49,13 +51,14 @@ # by sampler.forward() and used later to compute the pythonized # sample_result; for single-step scheduling: consumed immediately # inside sampler-forward() to compute pythonized sample_result. +@dataclass class SampleResultArgsType: sample_metadata: SampleMetadataType - sampling_metadata: SamplingMetadata - greedy_samples: Optional[torch.Tensor] = None multinomial_samples: MultinomialSamplesType - beam_search_logprobs: Optional[torch.Tensor] = None sample_results_dict: SampleResultsDictType + sampling_metadata: SamplingMetadata + greedy_samples: Optional[torch.Tensor] + beam_search_logprobs: Optional[torch.Tensor] # Union of non-deferred (single-step scheduling) @@ -648,7 +651,7 @@ def _top_k_top_p_multinomial_with_flashinfer( def get_pythonized_sample_results( - sample_result_args: SampleResultArgsType, ) -> SampleResultType: + sample_result_args: SampleResultArgsType) -> SampleResultType: ''' This function consumes GPU-side sampler results and computes Pythonized CPU-side sampler results (GPU -> CPU sync.) @@ -716,11 +719,6 @@ def _sample_with_torch( Torch-oriented _sample() implementation. ''' - # Begin building arguments for computing Pythonized sampler - # results. - maybe_deferred_args = SampleResultArgsType() - maybe_deferred_args.sampling_metadata = sampling_metadata - categorized_seq_group_ids: Dict[SamplingType, List[int]] = {t: [] for t in SamplingType} @@ -733,6 +731,8 @@ def _sample_with_torch( sample_results_dict: SampleResultsDictType = {} sample_metadata: SampleMetadataType = {} multinomial_samples: MultinomialSamplesType = {} + greedy_samples: Optional[torch.Tensor] = None + beam_search_logprobs: Optional[torch.Tensor] = None # Create output tensor for sampled token ids. if include_gpu_probs_tensor: @@ -758,7 +758,6 @@ def _sample_with_torch( if sampling_type == SamplingType.GREEDY: greedy_samples = torch.argmax(logprobs[long_sample_indices], dim=-1) - maybe_deferred_args.greedy_samples = greedy_samples if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. 
@@ -805,20 +804,25 @@ def _sample_with_torch( elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] - maybe_deferred_args.beam_search_logprobs = beam_search_logprobs else: raise ValueError(f"Unsupported sampling type: {sampling_type}") - maybe_deferred_args.sample_metadata = sample_metadata - maybe_deferred_args.multinomial_samples = multinomial_samples - maybe_deferred_args.sample_results_dict = sample_results_dict + # Begin building arguments for computing Pythonized sampler + # results. + maybe_deferred_args = SampleResultArgsType( + sampling_metadata=sampling_metadata, + sample_metadata=sample_metadata, + multinomial_samples=multinomial_samples, + greedy_samples=greedy_samples, + beam_search_logprobs=beam_search_logprobs, + sample_results_dict=sample_results_dict) if not sampling_metadata.skip_sampler_cpu_output: # GPU<->CPU sync happens here. # This also converts the sample output to Python objects. # Return Pythonized sampler result & sampled token ids return get_pythonized_sample_results( - maybe_deferred_args, ), sampled_token_ids_tensor + maybe_deferred_args), sampled_token_ids_tensor else: # Defer sampler result Pythonization; return deferred # Pythonization args & sampled token ids diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 8ec660d30ee3f..6bc4c1bd31d22 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -299,17 +299,18 @@ def execute_model( ModelOutput(output[0], output_ready_event, output[0].sampled_token_ids, False)) + # These GPU tensors are not required by multi-step; + # erase them to ensure they are not pythonized or + # transferred to CPU + output[0].sampled_token_ids = None + output[0].sampled_token_probs = None + # Pythonize the output if CPU is ahead and the previous step is # ready. for model_output in model_input.cached_outputs: model_output.maybe_pythonize(model_input, self._copy_stream, self.pinned_sampled_token_ids) - # make sure we dont try to serialize any GPU tensors - output[0].sampled_token_ids = None - output[0].sampled_token_probs = None - output[0].logprobs = None - model_input.current_step += 1 if not get_pp_group().is_last_rank: @@ -418,32 +419,50 @@ def vocab_size(self) -> int: Optional[List[SampleLogprobs]]] -def _maybe_deferred_pythonize_logprobs( - skip_sampler_cpu_output: bool, output: SamplerOutput, +def deferred_pythonize_logprobs( + output: SamplerOutput, sampling_metadata: SamplingMetadata) -> DeferredLogprobsReturnType: - if skip_sampler_cpu_output: - # Perform deferred logprob Pythonization + ''' + Perform deferred logprob Pythonization. - # - Deferred pythonized sample result computation - sample_result = get_pythonized_sample_results( - output.deferred_sample_results_args) + 1. Pythonize GPU-side output.deferred_sample_results_args + tensors into CPU-side sampler result. + 2. Pythonize GPU-side output.logprobs tensor into + CPU-side logprobs lists, utilizing CPU-side + sampler result for the computation. 
+ + Arguments: - # - Erase the CUDA-side deferred sample_result - # computation args - output.deferred_sample_results_args = None + * output: sampler output (under deferred Pythonization) + * sampling_metadata - # - Compute logprobs - ( - prompt_logprobs, - sample_logprobs, - ) = get_logprobs(output.logprobs, sampling_metadata, sample_result) + Returns: - assert len(prompt_logprobs) == len(sampling_metadata.seq_groups) - assert len(sample_logprobs) == len(sampling_metadata.seq_groups) + * prompt_logprobs (CPU) + * sample_logprobs (CPU) + ''' - return prompt_logprobs, sample_logprobs + # - Deferred pythonized sample result computation + sampler_result = get_pythonized_sample_results( + output.deferred_sample_results_args) + + # - Erase the CUDA-side deferred sample_result + # computation args to ensure it is never + # pythonized or transferred to CPU + output.deferred_sample_results_args = None + + # - Compute logprobs + ( + prompt_logprobs, + sample_logprobs, + ) = get_logprobs(output.logprobs, sampling_metadata, sampler_result) + assert len(prompt_logprobs) == len(sampling_metadata.seq_groups) + assert len(sample_logprobs) == len(sampling_metadata.seq_groups) - return None, None + # Erase the logprobs GPU tensor to ensure it is never pythonized + # or transferred to CPU + output.logprobs = None + return prompt_logprobs, sample_logprobs def _pythonize_sampler_output(model_input: StatefulModelInput, @@ -474,11 +493,13 @@ def _pythonize_sampler_output(model_input: StatefulModelInput, skip_sampler_cpu_output = ( frozen_model_input.sampling_metadata.skip_sampler_cpu_output) + # We are guaranteed output tensors are ready, so it is safe to + # pythonize the sampler output & obtain CPU-side logprobs ( prompt_logprobs, sample_logprobs, - ) = _maybe_deferred_pythonize_logprobs(skip_sampler_cpu_output, output, - sampling_metadata) + ) = (deferred_pythonize_logprobs(output, sampling_metadata) + if skip_sampler_cpu_output else (None, None)) for sgdx, (seq_group, sample_result) in enumerate( zip(sampling_metadata.seq_groups, samples_list)): From 861e1b9f061e994f586366f7f244a01f2ba029fd Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 15:15:45 -0400 Subject: [PATCH 05/47] refactoring --- .buildkite/test-pipeline.yaml | 1 + examples/offline_inference_multi_step.py | 2 +- tests/multi_step/test_correctness.py | 25 ++++++++++++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 59d7241bd452d..b735892328e99 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -155,6 +155,7 @@ steps: - python3 offline_inference_vision_language.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py + - python3 offline_inference_multi_step.py - label: Models Test # 1hr10min source_file_dependencies: diff --git a/examples/offline_inference_multi_step.py b/examples/offline_inference_multi_step.py index 5c603a8eca2a4..ee00eadf1a977 100644 --- a/examples/offline_inference_multi_step.py +++ b/examples/offline_inference_multi_step.py @@ -15,7 +15,7 @@ "The future of AI is", ] # Create a sampling params object. 
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95, logprobs=5) +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. llm = LLM(model="JackFram/llama-160m", diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness.py index 79b7f95650a20..b4518ddb32966 100644 --- a/tests/multi_step/test_correctness.py +++ b/tests/multi_step/test_correctness.py @@ -68,16 +68,17 @@ def get_text_generations(completions: Completion): ''' Logprobs values are extracted as List[List[Dict[str,float]]], i.e.: -* For each `SequenceGroup`, +* For each :class:`SequenceGroup`, * for each token offset in a sequence, * a mapping from str(token) -> logprob ''' -LogprobType = List[List[Dict[str, float]]] +LogprobType = List[Optional[List[Dict[str, float]]]] def get_logprob_generations(completions: Completion) -> LogprobType: - '''Obtain top-rank logprobs for each token in each `SequenceGroup`''' - return [x.logprobs.top_logprobs for x in completions.choices] + '''Obtain top-rank logprobs for each token in each :class:`SequenceGroup`''' + return [(None if x.logprobs is None else x.logprobs.top_logprobs) + for x in completions.choices] def assert_all_close_logprobs( @@ -109,10 +110,26 @@ def assert_all_close_logprobs( assert len(ref_logprobs) == len(test_logprobs), ( "Reference & test logprob SequenceGroup counts must match.") + if ref_logprobs[0] is None: + # It is expected that if one :class:`SequenceGroup` has + # `None` logprobs, then all :class:`SequenceGroup`s + # in the reference list have `None` logprobs. + # Validate this. + assert all([x is None for x in ref_logprobs]) + + # Next, assert that this is also true for + # test logprobs. + assert all([x is None for x in test_logprobs]) + return + for (group_ref_logprobs, group_test_logprobs) in zip(ref_logprobs, test_logprobs): + + assert group_ref_logprobs is not None + assert group_test_logprobs is not None assert len(group_ref_logprobs) == len(group_test_logprobs), ( "Reference & test logprob seq lens must match.") + for (token_ref_logprobs, token_test_logprobs) in zip(group_ref_logprobs, group_test_logprobs): From a34d1acf0e163e8cfca048fbf1fb5e61160c10e4 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 16:05:47 -0400 Subject: [PATCH 06/47] refactoring --- tests/multi_step/test_correctness.py | 24 +++++++---- tests/utils.py | 2 +- vllm/model_executor/layers/sampler.py | 57 ++++++++++++++++++++------ vllm/worker/multi_step_model_runner.py | 29 ++++++++----- 4 files changed, 81 insertions(+), 31 deletions(-) diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness.py index b4518ddb32966..6e3d7af8fb72d 100644 --- a/tests/multi_step/test_correctness.py +++ b/tests/multi_step/test_correctness.py @@ -67,10 +67,13 @@ def get_text_generations(completions: Completion): ''' -Logprobs values are extracted as List[List[Dict[str,float]]], i.e.: -* For each :class:`SequenceGroup`, -* for each token offset in a sequence, -* a mapping from str(token) -> logprob +Logprobs values are extracted as List[Optional[List[Dict[str,float]]]], i.e.: +* For each :class:`SequenceGroup`... +* ...if the completions API was invoked with a non-`None` `logprobs` argument: + * ...for each token offset in a sequence... 
+ * ...store a mapping from str(token) -> logprob +* ...else, if the completions API was invoked with `logprobs=None`: + * ...store None ''' LogprobType = List[Optional[List[Dict[str, float]]]] @@ -91,14 +94,18 @@ def assert_all_close_logprobs( Asserts that logprobs produced by the vLLM engine instance under test are very close to a set of ground-truth reference values. - Each individual reference logprob must be close to the test logprob, - according to the formula + If the completions API was invoked with a non-`None` `logprobs` argument, + then each individual reference logprob must be close to the test logprob, + according to the formula: assert abs(tok_top_test_logprob - tok_top_ref_logprob) <= (atol + rtol * abs( tok_top_ref_logprob)) + Else, if the completions API was invoked with `logprobs=None`, then + both the reference & test log probs should be List[None]. + Arguments: * ref_logprobs: ground-truth logprobs @@ -134,11 +141,12 @@ def assert_all_close_logprobs( token_test_logprobs) in zip(group_ref_logprobs, group_test_logprobs): assert token_ref_logprobs.keys() == token_test_logprobs.keys(), ( - "Reference & test top-logprob token sets must match.") + "Reference & test top tokens must match.") for (tok_str_ref, tok_top_ref_logprob) in token_ref_logprobs.items(): tok_top_test_logprob = token_test_logprobs[tok_str_ref] + # Validate logprobs are numerically very close assert abs(tok_top_test_logprob - tok_top_ref_logprob) <= ( atol + rtol * abs(tok_top_ref_logprob)) @@ -214,7 +222,7 @@ async def test_multi_step(example_prompts, model: str, tp_size: int, test_generations = get_text_generations(test_completions) assert ref_generations == test_generations - # Assert multi-step scheduling produces identical logprobs + # Assert multi-step scheduling produces nearly-identical logprobs # to single-step scheduling. ref_logprobs = get_logprob_generations(ref_completions) test_logprobs = get_logprob_generations(test_completions) diff --git a/tests/utils.py b/tests/utils.py index 96ce10c669f7f..3e0124fa11352 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -133,7 +133,7 @@ def get_client(self): api_key=self.DUMMY_API_KEY, ) - def get_async_client(self) -> openai.AsyncOpenAI: + def get_async_client(self): return openai.AsyncOpenAI( base_url=self.url_for("v1"), api_key=self.DUMMY_API_KEY, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 4ed855176a46f..c7aa0c20c4193 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -39,7 +39,7 @@ SampleResultType = List[Tuple[List[int], List[int]]] # Types of temporary data structures used for -# computing sample_results +# computing sample_result SampleMetadataType = Dict[SamplingType, Tuple[List[int], List[SequenceGroupToSample]]] MultinomialSamplesType = Dict[SamplingType, torch.Tensor] @@ -47,10 +47,14 @@ # Encapsulates temporary data structures for computing -# sample_result; for multi-step scheduling: must be returned -# by sampler.forward() and used later to compute the pythonized -# sample_result; for single-step scheduling: consumed immediately -# inside sampler-forward() to compute pythonized sample_result. +# sample_result. +# +# * For multi-step scheduling: must be returned +# by `Sampler.forward()` and used later to compute the pythonized +# sample_result +# +# * For single-step scheduling: consumed immediately +# inside `Sampler.forward()` to compute pythonized sample_result. 
@dataclass class SampleResultArgsType: sample_metadata: SampleMetadataType @@ -66,7 +70,7 @@ class SampleResultArgsType: # sample result types MaybeDeferredSampleResultType = Union[SampleResultType, SampleResultArgsType] -# Shorthand for _sample() +# Abbreviation of the _sample() return type SampleReturnType = Tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] @@ -89,9 +93,9 @@ class SamplerOutput( # On-device tensor containing the logprobs of each token. logprobs: Optional["torch.Tensor"] = None - # Holds the pythonized sample result (single-step) - # or arguments for deferred pythonized sample result - # computation (muliti-step) + # Holds either (1) the pythonized sampler result (single-step scheduling) + # or (2) what will be arguments for later deferred pythonization of the + # sampler result (muliti-step scheduling) deferred_sample_results_args: Optional[SampleResultArgsType] = None # On-device tensor containing the sampled token ids. @@ -203,6 +207,19 @@ def forward( sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: """ + Single-step scheduling: + * Perform GPU-side sampling computation & compute + GPU-side logprobs tensor + * Pythonize sampling result & logprobs tensor + + Multi-step scheduling: + * Perform GPU-side sampling computation & compute + GPU-side logprobs tensor + * Defer Pythonization of sampling result & logprobs + tensor + * Encapsulate arguments required for deferred Pythonization + in the :class:`SamplerOutput` structure + Args: logits: (num_tokens, vocab_size). sampling_metadata: Metadata for sampling. @@ -265,15 +282,22 @@ def forward( ) if self.include_gpu_probs_tensor: + # Since we will defer sampler result Pythonization, + # preserve GPU-side tensors in support of later + # deferred pythonization of logprobs assert maybe_sampled_tokens_tensor is not None on_device_tensors = (probs, logprobs, maybe_sampled_tokens_tensor) else: + # Since Pythonization has already happened, don't preserve + # GPU-side tensors. on_device_tensors = None # Get the logprobs query results. prompt_logprobs = None sample_logprobs = None if not sampling_metadata.skip_sampler_cpu_output: + # Pythonize logprobs now (GPU -> CPU); do not + # defer. assert not isinstance(maybe_deferred_sample_results, SampleResultArgsType) prompt_logprobs, sample_logprobs = get_logprobs( @@ -717,6 +741,15 @@ def _sample_with_torch( ) -> SampleReturnType: ''' Torch-oriented _sample() implementation. + + Single-step scheduling: + * Perform GPU-side sampling computation + * Immediately Pythonize sampling result + + Multi-step scheduling: + * Perform GPU-side sampling computation + * Defer Pythonization & preserve GPU-side + tensors required for Pythonization ''' categorized_seq_group_ids: Dict[SamplingType, @@ -807,8 +840,8 @@ def _sample_with_torch( else: raise ValueError(f"Unsupported sampling type: {sampling_type}") - # Begin building arguments for computing Pythonized sampler - # results. + # Encapsulate arguments for computing Pythonized sampler + # results, whether deferred or otherwise. maybe_deferred_args = SampleResultArgsType( sampling_metadata=sampling_metadata, sample_metadata=sample_metadata, @@ -819,7 +852,7 @@ def _sample_with_torch( if not sampling_metadata.skip_sampler_cpu_output: # GPU<->CPU sync happens here. - # This also converts the sample output to Python objects. + # This also converts the sampler output to a Python object. 
# Return Pythonized sampler result & sampled token ids return get_pythonized_sample_results( maybe_deferred_args), sampled_token_ids_tensor diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 6bc4c1bd31d22..e39ca59f3ae3e 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -420,17 +420,23 @@ def vocab_size(self) -> int: def deferred_pythonize_logprobs( - output: SamplerOutput, - sampling_metadata: SamplingMetadata) -> DeferredLogprobsReturnType: + output: SamplerOutput, + sampling_metadata: SamplingMetadata, +) -> DeferredLogprobsReturnType: ''' Perform deferred logprob Pythonization. - 1. Pythonize GPU-side output.deferred_sample_results_args + 1. Pythonize GPU-side sampler result tensors into CPU-side sampler result. - 2. Pythonize GPU-side output.logprobs tensor into - CPU-side logprobs lists, utilizing CPU-side - sampler result for the computation. + 2. Pythonize GPU-side logprobs tensor into + CPU-side logprobs lists, utilizing + the Pythonized sampler result computed + in step 1. + These deferred computations are not required for + single-step scheduling or the `profile_run()` + phase of multi-step scheduling. + Arguments: * output: sampler output (under deferred Pythonization) @@ -442,16 +448,16 @@ def deferred_pythonize_logprobs( * sample_logprobs (CPU) ''' - # - Deferred pythonized sample result computation + # - Deferred pythonization of sample result sampler_result = get_pythonized_sample_results( output.deferred_sample_results_args) - # - Erase the CUDA-side deferred sample_result + # - Erase the GPU-side deferred sample_result # computation args to ensure it is never # pythonized or transferred to CPU output.deferred_sample_results_args = None - # - Compute logprobs + # - Deferred pythonization of logprobs ( prompt_logprobs, sample_logprobs, @@ -494,7 +500,10 @@ def _pythonize_sampler_output(model_input: StatefulModelInput, frozen_model_input.sampling_metadata.skip_sampler_cpu_output) # We are guaranteed output tensors are ready, so it is safe to - # pythonize the sampler output & obtain CPU-side logprobs + # pythonize the sampler output & obtain CPU-side logprobs. + # + # However this computation may be skipped entirely + # if no pythonization was deferred. ( prompt_logprobs, sample_logprobs, From 1284327eeed3a10a1f42a39eeccb4a8b54da1a56 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 16:20:25 -0400 Subject: [PATCH 07/47] removing example --- examples/offline_inference_multi_step.py | 35 ------------------------ 1 file changed, 35 deletions(-) delete mode 100644 examples/offline_inference_multi_step.py diff --git a/examples/offline_inference_multi_step.py b/examples/offline_inference_multi_step.py deleted file mode 100644 index ee00eadf1a977..0000000000000 --- a/examples/offline_inference_multi_step.py +++ /dev/null @@ -1,35 +0,0 @@ -''' -Example of setting up LLM with multi-step enabled. -In actuality, async engine would be a more sensible choice -from a performance perspective. However this example is useful -for demonstration & debugging of multi-step code. -''' - -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Create an LLM. 
-llm = LLM(model="JackFram/llama-160m", - swap_space=16, - tensor_parallel_size=1, - gpu_memory_utilization=0.9, - num_scheduler_steps=8, - use_v2_block_manager=True, - enforce_eager=True) -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") From a6c1207f74f70155c7f70291eb2c64914c7f76c9 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 16:20:58 -0400 Subject: [PATCH 08/47] removed example from build pipeline --- .buildkite/test-pipeline.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9ed0624cc5347..aa90145705f9d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -156,7 +156,6 @@ steps: - python3 offline_inference_vision_language.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py - - python3 offline_inference_multi_step.py - label: Models Test # 1hr10min source_file_dependencies: From fe42995f7231b646611a038c6e14971f9fda0b17 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 17:24:16 -0400 Subject: [PATCH 09/47] fixed one docstring; embedded NUM_LOGPROBS --- tests/multi_step/test_correctness.py | 4 +-- vllm/worker/multi_step_model_runner.py | 34 ++++++++++---------------- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness.py index 6e3d7af8fb72d..1ff6247167c02 100644 --- a/tests/multi_step/test_correctness.py +++ b/tests/multi_step/test_correctness.py @@ -23,8 +23,6 @@ "16", ] -NUM_LOGPROBS = [None, 5] # `logprobs` argument to OpenAI completions API - async def completions_with_server_args( prompts: List[str], @@ -159,7 +157,7 @@ def assert_all_close_logprobs( @pytest.mark.parametrize("eager_mode", [False, True]) @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", NUM_LOGPROBS) +@pytest.mark.parametrize("num_logprobs", [None, 5]) @pytest.mark.asyncio async def test_multi_step(example_prompts, model: str, tp_size: int, pp_size: int, eager_mode: int, diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index e39ca59f3ae3e..6d2af33498167 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -423,30 +423,22 @@ def deferred_pythonize_logprobs( output: SamplerOutput, sampling_metadata: SamplingMetadata, ) -> DeferredLogprobsReturnType: - ''' - Perform deferred logprob Pythonization. - - 1. Pythonize GPU-side sampler result - tensors into CPU-side sampler result. - 2. Pythonize GPU-side logprobs tensor into - CPU-side logprobs lists, utilizing - the Pythonized sampler result computed - in step 1. - - These deferred computations are not required for - single-step scheduling or the `profile_run()` - phase of multi-step scheduling. - - Arguments: + """Perform deferred logprob Pythonization. 
- * output: sampler output (under deferred Pythonization) - * sampling_metadata + 1. Pythonize GPU-side sampler result tensors into CPU-side sampler result. + 2. Pythonize GPU-side logprobs tensor into CPU-side logprobs lists, + utilizing the Pythonized sampler result computed in step 1. + + These deferred computations are not required for single-step scheduling + or the `profile_run()` phase of multi-step scheduling. + Args: + output: sampler output (under deferred Pythonization) + sampling_metadata + Returns: - - * prompt_logprobs (CPU) - * sample_logprobs (CPU) - ''' + prompt_logprobs (CPU), sample_logprobs (CPU) + """ # - Deferred pythonization of sample result sampler_result = get_pythonized_sample_results( From 9fb5bbe8edf524f49775f919798000283240bf2f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 17:38:44 -0400 Subject: [PATCH 10/47] test refactor --- tests/conftest.py | 2 +- tests/multi_step/test_correctness.py | 143 ++------------------------- tests/utils.py | 130 ++++++++++++++++++++++++ 3 files changed, 140 insertions(+), 135 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 08a2c8fcda021..17716967772a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -799,4 +799,4 @@ def dummy_opt_path(): config["architectures"] = ["MyOPTForCausalLM"] with open(json_path, "w") as f: json.dump(config, f) - return _dummy_path + return _dummy_path \ No newline at end of file diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness.py index 1ff6247167c02..32d58e519e1d3 100644 --- a/tests/multi_step/test_correctness.py +++ b/tests/multi_step/test_correctness.py @@ -1,11 +1,12 @@ # Test the AsyncLLMEngine with multi-step-decoding -from typing import Dict, List, Optional +from typing import List, Optional import pytest -from openai.types.completion import Completion - -from ..utils import RemoteOpenAIServer +from ..utils import (completions_with_server_args, + get_client_text_generations, + get_client_logprob_generations, + assert_all_close_logprobs) MODELS = [ "JackFram/llama-160m", @@ -23,132 +24,6 @@ "16", ] - -async def completions_with_server_args( - prompts: List[str], - model_name: str, - server_cli_args: List[str], - num_logprobs: Optional[int], -) -> Completion: - ''' - Construct a remote OpenAI server, obtain an async client to the - server & invoke the completions API to obtain completions. - - Arguments: - - * prompts: test prompts - * model_name: model to spin up on the vLLM server - * server_cli_args: CLI args for starting the server - - Returns: - - * OpenAI Completion instance - ''' - - outputs = None - with RemoteOpenAIServer(model_name, server_cli_args) as server: - client = server.get_async_client() - outputs = await client.completions.create(model=model_name, - prompt=prompts, - temperature=0, - stream=False, - max_tokens=5, - logprobs=num_logprobs) - assert outputs is not None - - return outputs - - -def get_text_generations(completions: Completion): - '''Obtain generated tokens''' - return [x.text for x in completions.choices] - - -''' -Logprobs values are extracted as List[Optional[List[Dict[str,float]]]], i.e.: -* For each :class:`SequenceGroup`... -* ...if the completions API was invoked with a non-`None` `logprobs` argument: - * ...for each token offset in a sequence... 
- * ...store a mapping from str(token) -> logprob -* ...else, if the completions API was invoked with `logprobs=None`: - * ...store None -''' -LogprobType = List[Optional[List[Dict[str, float]]]] - - -def get_logprob_generations(completions: Completion) -> LogprobType: - '''Obtain top-rank logprobs for each token in each :class:`SequenceGroup`''' - return [(None if x.logprobs is None else x.logprobs.top_logprobs) - for x in completions.choices] - - -def assert_all_close_logprobs( - ref_logprobs: LogprobType, - test_logprobs: LogprobType, - atol: float = 1e-3, - rtol: float = 1e-3, -) -> None: - ''' - Asserts that logprobs produced by the vLLM engine instance under test - are very close to a set of ground-truth reference values. - - If the completions API was invoked with a non-`None` `logprobs` argument, - then each individual reference logprob must be close to the test logprob, - according to the formula: - - assert abs(tok_top_test_logprob - - tok_top_ref_logprob) <= (atol + - rtol * abs( - tok_top_ref_logprob)) - - Else, if the completions API was invoked with `logprobs=None`, then - both the reference & test log probs should be List[None]. - - Arguments: - - * ref_logprobs: ground-truth logprobs - * test_logprobs: logprobs produced by vLLM engine under test - * atol: absolute mismatch tolerance when comparing single logprobs - * rtol: relative mismatch tolerance when comparing single logprobs - ''' - - assert len(ref_logprobs) == len(test_logprobs), ( - "Reference & test logprob SequenceGroup counts must match.") - - if ref_logprobs[0] is None: - # It is expected that if one :class:`SequenceGroup` has - # `None` logprobs, then all :class:`SequenceGroup`s - # in the reference list have `None` logprobs. - # Validate this. - assert all([x is None for x in ref_logprobs]) - - # Next, assert that this is also true for - # test logprobs. - assert all([x is None for x in test_logprobs]) - return - - for (group_ref_logprobs, - group_test_logprobs) in zip(ref_logprobs, test_logprobs): - - assert group_ref_logprobs is not None - assert group_test_logprobs is not None - assert len(group_ref_logprobs) == len(group_test_logprobs), ( - "Reference & test logprob seq lens must match.") - - for (token_ref_logprobs, - token_test_logprobs) in zip(group_ref_logprobs, - group_test_logprobs): - assert token_ref_logprobs.keys() == token_test_logprobs.keys(), ( - "Reference & test top tokens must match.") - for (tok_str_ref, - tok_top_ref_logprob) in token_ref_logprobs.items(): - tok_top_test_logprob = token_test_logprobs[tok_str_ref] - - # Validate logprobs are numerically very close - assert abs(tok_top_test_logprob - tok_top_ref_logprob) <= ( - atol + rtol * abs(tok_top_ref_logprob)) - - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize(("tp_size, pp_size"), [ (1, 1), @@ -216,14 +91,14 @@ async def test_multi_step(example_prompts, model: str, tp_size: int, # Assert multi-step scheduling produces identical tokens # to single-step scheduling. - ref_generations = get_text_generations(ref_completions) - test_generations = get_text_generations(test_completions) + ref_generations = get_client_text_generations(ref_completions) + test_generations = get_client_text_generations(test_completions) assert ref_generations == test_generations # Assert multi-step scheduling produces nearly-identical logprobs # to single-step scheduling. 
- ref_logprobs = get_logprob_generations(ref_completions) - test_logprobs = get_logprob_generations(test_completions) + ref_logprobs = get_client_logprob_generations(ref_completions) + test_logprobs = get_client_logprob_generations(test_completions) assert_all_close_logprobs( ref_logprobs, test_logprobs, diff --git a/tests/utils.py b/tests/utils.py index 3e0124fa11352..ad16944574a35 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,6 +8,8 @@ from contextlib import contextmanager from pathlib import Path from typing import Any, Callable, Dict, List, Optional +from utils import RemoteOpenAIServer +from openai.types.completion import Completion import openai import requests @@ -414,3 +416,131 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: f" args {args} and kwargs {kwargs}") return wrapper + +async def completions_with_server_args( + prompts: List[str], + model_name: str, + server_cli_args: List[str], + num_logprobs: Optional[int], +) -> Completion: + ''' + Construct a remote OpenAI server, obtain an async client to the + server & invoke the completions API to obtain completions. + + Arguments: + + * prompts: test prompts + * model_name: model to spin up on the vLLM server + * server_cli_args: CLI args for starting the server + + Returns: + + * OpenAI Completion instance + ''' + + outputs = None + with RemoteOpenAIServer(model_name, server_cli_args) as server: + client = server.get_async_client() + outputs = await client.completions.create(model=model_name, + prompt=prompts, + temperature=0, + stream=False, + max_tokens=5, + logprobs=num_logprobs) + assert outputs is not None + + return outputs + +def get_client_text_generations(completions: Completion): + '''Extract generated tokens from the output of a + request made to an Open-AI-protocol completions endpoint.''' + return [x.text for x in completions.choices] + + +'''Logprobs values are extracted as +List[Optional[List[Dict[str,float]]]], i.e.: + +For each :class:`SequenceGroup`... + ...if the completions API was invoked with a non-`None` `logprobs` argument: + ...for each token offset in a sequence... + ...store a mapping from str(token) -> logprob + ...else, if the completions API was invoked with `logprobs=None`: + ...store None +''' +LogprobType = List[Optional[List[Dict[str, float]]]] + + +def get_client_logprob_generations(completions: Completion) -> LogprobType: + '''Operates on the output of a request made to an Open-AI-protocol + completions endpoint; obtains top-rank logprobs for each token in + each :class:`SequenceGroup` + ''' + return [(None if x.logprobs is None else x.logprobs.top_logprobs) + for x in completions.choices] + + +def assert_all_close_logprobs( + ref_logprobs: LogprobType, + test_logprobs: LogprobType, + atol: float = 1e-3, + rtol: float = 1e-3, +) -> None: + ''' + Asserts that logprobs produced by the vLLM engine instance under test + are very close to a set of ground-truth reference values. + + If the completions API was invoked with a non-`None` `logprobs` argument, + then each individual reference logprob must be close to the test logprob, + according to the formula: + + assert abs(tok_top_test_logprob - + tok_top_ref_logprob) <= (atol + + rtol * abs( + tok_top_ref_logprob)) + + Else, if the completions API was invoked with `logprobs=None`, then + both the reference & test log probs should be List[None]. 
+ + Arguments: + + * ref_logprobs: ground-truth logprobs + * test_logprobs: logprobs produced by vLLM engine under test + * atol: absolute mismatch tolerance when comparing single logprobs + * rtol: relative mismatch tolerance when comparing single logprobs + ''' + + assert len(ref_logprobs) == len(test_logprobs), ( + "Reference & test logprob SequenceGroup counts must match.") + + if ref_logprobs[0] is None: + # It is expected that if one :class:`SequenceGroup` has + # `None` logprobs, then all :class:`SequenceGroup`s + # in the reference list have `None` logprobs. + # Validate this. + assert all([x is None for x in ref_logprobs]) + + # Next, assert that this is also true for + # test logprobs. + assert all([x is None for x in test_logprobs]) + return + + for (group_ref_logprobs, + group_test_logprobs) in zip(ref_logprobs, test_logprobs): + + assert group_ref_logprobs is not None + assert group_test_logprobs is not None + assert len(group_ref_logprobs) == len(group_test_logprobs), ( + "Reference & test logprob seq lens must match.") + + for (token_ref_logprobs, + token_test_logprobs) in zip(group_ref_logprobs, + group_test_logprobs): + assert token_ref_logprobs.keys() == token_test_logprobs.keys(), ( + "Reference & test top tokens must match.") + for (tok_str_ref, + tok_top_ref_logprob) in token_ref_logprobs.items(): + tok_top_test_logprob = token_test_logprobs[tok_str_ref] + + # Validate logprobs are numerically very close + assert abs(tok_top_test_logprob - tok_top_ref_logprob) <= ( + atol + rtol * abs(tok_top_ref_logprob)) \ No newline at end of file From 046a8b11c71c0bad49bc56bf158a417ac4374282 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 17:44:52 -0400 Subject: [PATCH 11/47] incremental refactors --- tests/multi_step/test_correctness.py | 7 ++++--- tests/utils.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness.py index 32d58e519e1d3..743b6f9bb5850 100644 --- a/tests/multi_step/test_correctness.py +++ b/tests/multi_step/test_correctness.py @@ -3,10 +3,10 @@ from typing import List, Optional import pytest -from ..utils import (completions_with_server_args, - get_client_text_generations, + +from ..utils import (assert_all_close_logprobs, completions_with_server_args, get_client_logprob_generations, - assert_all_close_logprobs) + get_client_text_generations) MODELS = [ "JackFram/llama-160m", @@ -24,6 +24,7 @@ "16", ] + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize(("tp_size, pp_size"), [ (1, 1), diff --git a/tests/utils.py b/tests/utils.py index ad16944574a35..1fcea4adfadee 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,11 +8,10 @@ from contextlib import contextmanager from pathlib import Path from typing import Any, Callable, Dict, List, Optional -from utils import RemoteOpenAIServer -from openai.types.completion import Completion import openai import requests +from openai.types.completion import Completion from transformers import AutoTokenizer from typing_extensions import ParamSpec @@ -417,6 +416,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: return wrapper + async def completions_with_server_args( prompts: List[str], model_name: str, @@ -451,6 +451,7 @@ async def completions_with_server_args( return outputs + def get_client_text_generations(completions: Completion): '''Extract generated tokens from the output of a request made to an Open-AI-protocol completions endpoint.''' @@ -543,4 +544,4 @@ def 
assert_all_close_logprobs( # Validate logprobs are numerically very close assert abs(tok_top_test_logprob - tok_top_ref_logprob) <= ( - atol + rtol * abs(tok_top_ref_logprob)) \ No newline at end of file + atol + rtol * abs(tok_top_ref_logprob)) From fa86efded28e7aad24d67548bbf8dd8a76e44973 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 17:46:40 -0400 Subject: [PATCH 12/47] remove unnecessary conftest change --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 17716967772a1..08a2c8fcda021 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -799,4 +799,4 @@ def dummy_opt_path(): config["architectures"] = ["MyOPTForCausalLM"] with open(json_path, "w") as f: json.dump(config, f) - return _dummy_path \ No newline at end of file + return _dummy_path From 1c0ffb693540faa5e8b72a3fcdad2918a692ddbb Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Wed, 21 Aug 2024 17:48:14 -0400 Subject: [PATCH 13/47] Update vllm/model_executor/layers/sampler.py Co-authored-by: Cody Yu --- vllm/model_executor/layers/sampler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index c7aa0c20c4193..88b0d8f01405d 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -296,8 +296,7 @@ def forward( prompt_logprobs = None sample_logprobs = None if not sampling_metadata.skip_sampler_cpu_output: - # Pythonize logprobs now (GPU -> CPU); do not - # defer. + # Pythonize logprobs now (GPU -> CPU); do not defer. assert not isinstance(maybe_deferred_sample_results, SampleResultArgsType) prompt_logprobs, sample_logprobs = get_logprobs( From 3babadbf3ab4870926ecf16b157486f3343d952c Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 17:50:39 -0400 Subject: [PATCH 14/47] refactor --- vllm/model_executor/layers/sampler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index c7aa0c20c4193..3b2d83e181575 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,6 +1,7 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools import warnings +from dataclasses import dataclass from importlib.util import find_spec from math import inf from typing import Dict, List, Optional, Tuple, Union @@ -15,8 +16,6 @@ if HAS_TRITON: from vllm.model_executor.layers.ops.sample import sample as sample_triton -from dataclasses import dataclass - import vllm.envs as envs from vllm.model_executor.sampling_metadata import (SamplingMetadata, SamplingTensors, From 1875b372b4e2d5c211e3c7ff174bc1880dc9c243 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 17:53:18 -0400 Subject: [PATCH 15/47] test_multi_step comment --- tests/multi_step/test_correctness.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness.py index 743b6f9bb5850..3ad21be000c94 100644 --- a/tests/multi_step/test_correctness.py +++ b/tests/multi_step/test_correctness.py @@ -39,8 +39,7 @@ async def test_multi_step(example_prompts, model: str, tp_size: int, pp_size: int, eager_mode: int, num_scheduler_steps: int, num_prompts: int, num_logprobs: Optional[int]): - ''' - Test vLLM engine with 
multi-step scheduling in an OpenAI-protocol + '''Test vLLM engine with multi-step scheduling in an OpenAI-protocol client/server environment. Set up an engine with single-step scheduling as a ground-truth reference. @@ -51,17 +50,16 @@ async def test_multi_step(example_prompts, model: str, tp_size: int, * Generated tokens match * Generated logprobs are all very close - Arguments: - - * example_prompts: test fixture providing example prompts - * model: model under test (same for single- and multi-step engines) - * tp_size: degree of tensor-parallelism - * pp_size: degree of pipeline-parallelism - * eager_mode - * num_scheduler_steps: for multi-step scheduling, GPU-side steps per + Args: + example_prompts: test fixture providing example prompts + model: model under test (same for single- and multi-step engines) + tp_size: degree of tensor-parallelism + pp_size: degree of pipeline-parallelism + eager_mode + num_scheduler_steps: for multi-step scheduling, GPU-side steps per GPU -> CPU output transfer - * num_prompts: number of example prompts under test - * num_logprobs: corresponds to the `logprobs` argument to the OpenAI + num_prompts: number of example prompts under test + num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> no logprobs ''' From 3760a95628c7488d1a6727c6d114f94c616fb9ab Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 17:55:16 -0400 Subject: [PATCH 16/47] utils function docstrings --- tests/utils.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 1fcea4adfadee..60b6ad6a0f9a9 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -423,19 +423,16 @@ async def completions_with_server_args( server_cli_args: List[str], num_logprobs: Optional[int], ) -> Completion: - ''' - Construct a remote OpenAI server, obtain an async client to the + '''Construct a remote OpenAI server, obtain an async client to the server & invoke the completions API to obtain completions. - Arguments: - - * prompts: test prompts - * model_name: model to spin up on the vLLM server - * server_cli_args: CLI args for starting the server + Args: + prompts: test prompts + model_name: model to spin up on the vLLM server + server_cli_args: CLI args for starting the server Returns: - - * OpenAI Completion instance + OpenAI Completion instance ''' outputs = None @@ -454,7 +451,8 @@ async def completions_with_server_args( def get_client_text_generations(completions: Completion): '''Extract generated tokens from the output of a - request made to an Open-AI-protocol completions endpoint.''' + request made to an Open-AI-protocol completions endpoint. + ''' return [x.text for x in completions.choices] @@ -486,8 +484,7 @@ def assert_all_close_logprobs( atol: float = 1e-3, rtol: float = 1e-3, ) -> None: - ''' - Asserts that logprobs produced by the vLLM engine instance under test + '''Asserts that logprobs produced by the vLLM engine instance under test are very close to a set of ground-truth reference values. If the completions API was invoked with a non-`None` `logprobs` argument, @@ -502,12 +499,11 @@ def assert_all_close_logprobs( Else, if the completions API was invoked with `logprobs=None`, then both the reference & test log probs should be List[None]. 
- Arguments: - - * ref_logprobs: ground-truth logprobs - * test_logprobs: logprobs produced by vLLM engine under test - * atol: absolute mismatch tolerance when comparing single logprobs - * rtol: relative mismatch tolerance when comparing single logprobs + Args: + ref_logprobs: ground-truth logprobs + test_logprobs: logprobs produced by vLLM engine under test + atol: absolute mismatch tolerance when comparing single logprobs + rtol: relative mismatch tolerance when comparing single logprobs ''' assert len(ref_logprobs) == len(test_logprobs), ( From d43308c7fedb36b07579c97c43720f59781cea47 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 17:59:03 -0400 Subject: [PATCH 17/47] docstring refactors --- vllm/model_executor/layers/sampler.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index aba49bb941e33..f04542fa8040c 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -674,8 +674,7 @@ def _top_k_top_p_multinomial_with_flashinfer( def get_pythonized_sample_results( sample_result_args: SampleResultArgsType) -> SampleResultType: - ''' - This function consumes GPU-side sampler results and computes + '''This function consumes GPU-side sampler results and computes Pythonized CPU-side sampler results (GPU -> CPU sync.) Single-step scheduling: this function is invoked at sampling-time @@ -684,13 +683,11 @@ def get_pythonized_sample_results( Multi-step scheduling: Pythonization is deferred until after multiple GPU-side steps have been completed. - Arguments: - - * sample_result_args: GPU-side inputs to the Pythonization process + Args: + sample_result_args: GPU-side inputs to the Pythonization process Returns: - - * Pythonized sampler results + Pythonized sampler results ''' ( @@ -737,8 +734,7 @@ def _sample_with_torch( include_gpu_probs_tensor: bool, modify_greedy_probs: bool, ) -> SampleReturnType: - ''' - Torch-oriented _sample() implementation. + '''Torch-oriented _sample() implementation. Single-step scheduling: * Perform GPU-side sampling computation From dfbbaf05228da1da996c715483b9a5961130f8cb Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 21 Aug 2024 19:24:47 -0400 Subject: [PATCH 18/47] passing tests & formatted --- vllm/worker/multi_step_model_runner.py | 56 ++++++++++++++++++++------ 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 6d2af33498167..47b567e98f431 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -54,6 +54,8 @@ class ModelOutput: sampler_output_ready_event: torch.cuda.Event sampled_token_ids: Optional[torch.Tensor] = None pythonized: bool = False + # On-device tensor containing the logprobs of each token. + logprobs: Optional["torch.Tensor"] = None def pythonize(self, input_metadata: "StatefulModelInput", copy_stream: torch.cuda.Stream, @@ -79,7 +81,9 @@ def _pythonize_sampler_output(self, input_metadata: "StatefulModelInput", blocking: bool) -> bool: """ If blocking is set, will block until the forward pass for the output is - ready and pythonize the output. + ready and pythonize the output. Upon completing Pythonization, erases + self.logprobs (note that a non-blocking call that is performed when + the sampler output is not yet ready, will not erase self.logprobs.) 
""" assert self.sampled_token_ids is not None if not blocking and not self.sampler_output_ready_event.query(): @@ -90,7 +94,15 @@ def _pythonize_sampler_output(self, input_metadata: "StatefulModelInput", with torch.cuda.stream(copy_stream): _pythonize_sampler_output(input_metadata, self.sampler_output, pinned_sampled_token_buffer, - self.sampled_token_ids) + self.sampled_token_ids, self.logprobs) + + # Erase the logprobs GPU-side tensor. + # Note that although _pythonize_sampler_output() runs in its + # own CUDA stream, nonetheless _pythonize_sampler_output() + # cannot return until Pythonization is complete; therefore + # we know that by the time the CPU reaches this point, + # `self.logprobs` is no longer needed. + self.logprobs = None return True @@ -297,13 +309,15 @@ def execute_model( 0].sampled_token_ids.cpu() model_input.cached_outputs.append( ModelOutput(output[0], output_ready_event, - output[0].sampled_token_ids, False)) + output[0].sampled_token_ids, False, + output[0].logprobs)) # These GPU tensors are not required by multi-step; # erase them to ensure they are not pythonized or # transferred to CPU output[0].sampled_token_ids = None output[0].sampled_token_probs = None + output[0].logprobs = None # Pythonize the output if CPU is ahead and the previous step is # ready. @@ -422,6 +436,7 @@ def vocab_size(self) -> int: def deferred_pythonize_logprobs( output: SamplerOutput, sampling_metadata: SamplingMetadata, + logprobs_tensor: Optional[torch.Tensor], ) -> DeferredLogprobsReturnType: """Perform deferred logprob Pythonization. @@ -453,22 +468,36 @@ def deferred_pythonize_logprobs( ( prompt_logprobs, sample_logprobs, - ) = get_logprobs(output.logprobs, sampling_metadata, sampler_result) + ) = get_logprobs(logprobs_tensor, sampling_metadata, sampler_result) assert len(prompt_logprobs) == len(sampling_metadata.seq_groups) assert len(sample_logprobs) == len(sampling_metadata.seq_groups) - # Erase the logprobs GPU tensor to ensure it is never pythonized - # or transferred to CPU - output.logprobs = None return prompt_logprobs, sample_logprobs -def _pythonize_sampler_output(model_input: StatefulModelInput, - output: SamplerOutput, - pinned_sampled_token_buffer: torch.Tensor, - sampled_token_ids: torch.Tensor) -> None: +def _pythonize_sampler_output( + model_input: StatefulModelInput, + output: SamplerOutput, + pinned_sampled_token_buffer: torch.Tensor, + sampled_token_ids: torch.Tensor, + logprobs_tensor: Optional[torch.Tensor], +) -> None: """ This function is only called when the output tensors are ready. - See ModelOutput + See :class:`ModelOutput`. + + Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, + adding a Pythonized output data structure + (:class:`CompletionSequenceGroupOutput`) for each :class:`SequenceGroup`. + + Args: + model_input + output: sampler output + pinned_sampled_token_token_buffer: CPU-side pinned memory + (receives copy of + GPU-side token buffer.) 
+ sampled_token_ids: GPU-side token buffer + logprobs_tensor: GPU-side tensor containing + logprobs computed during sampling """ assert model_input.frozen_model_input is not None @@ -499,7 +528,8 @@ def _pythonize_sampler_output(model_input: StatefulModelInput, ( prompt_logprobs, sample_logprobs, - ) = (deferred_pythonize_logprobs(output, sampling_metadata) + ) = (deferred_pythonize_logprobs(output, sampling_metadata, + logprobs_tensor) if skip_sampler_cpu_output else (None, None)) for sgdx, (seq_group, sample_result) in enumerate( From e0d59cea8a409584e83b3ef1f50ff7987a3181e1 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 22 Aug 2024 11:40:22 -0400 Subject: [PATCH 19/47] removed incorrect SamplerOutput imports --- vllm/model_executor/models/eagle.py | 3 ++- vllm/model_executor/models/ultravox.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 99c825ff63572..515fe17d4110b 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -4,13 +4,14 @@ import torch.nn as nn from vllm.attention.backends.abstract import AttentionMetadata +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models import ModelRegistry from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.eagle import EAGLEConfig diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 842264f765866..c49dc5889b8f7 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -37,7 +37,8 @@ from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) -from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SamplerOutput, SequenceData +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData from vllm.transformers_utils.configs.ultravox import UltravoxConfig _AUDIO_PLACEHOLDER_TOKEN = 128002 From 102fd92b21f960801c4622d6190c37cbdba0805d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 22 Aug 2024 11:43:46 -0400 Subject: [PATCH 20/47] formatting --- vllm/model_executor/models/eagle.py | 2 +- vllm/model_executor/models/ultravox.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 515fe17d4110b..ad1ab0231d861 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -4,8 +4,8 @@ import torch.nn as nn from vllm.attention.backends.abstract import AttentionMetadata -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.model_loader.weight_utils import default_weight_loader diff --git 
a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index c49dc5889b8f7..74de9565ca980 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -27,6 +27,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.model_executor.models.utils import (filter_weights, @@ -37,7 +38,6 @@ from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData from vllm.transformers_utils.configs.ultravox import UltravoxConfig From 948f4ef42512c90d67ce7cd9793fd7e11408504a Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:29:55 -0400 Subject: [PATCH 21/47] Update tests/multi_step/test_correctness.py Co-authored-by: Cody Yu --- tests/multi_step/test_correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness.py index 3ad21be000c94..ea0181e2bf47c 100644 --- a/tests/multi_step/test_correctness.py +++ b/tests/multi_step/test_correctness.py @@ -39,7 +39,7 @@ async def test_multi_step(example_prompts, model: str, tp_size: int, pp_size: int, eager_mode: int, num_scheduler_steps: int, num_prompts: int, num_logprobs: Optional[int]): - '''Test vLLM engine with multi-step scheduling in an OpenAI-protocol + """Test vLLM engine with multi-step scheduling in an OpenAI-protocol client/server environment. Set up an engine with single-step scheduling as a ground-truth reference. 
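
The docstring above summarizes the comparison pattern used throughout this series: run the same prompts against a single-step engine (the ground-truth reference) and a multi-step engine, require the greedily generated text to match, and require the returned logprobs to be close rather than bit-identical. A rough sketch of that pattern is shown below; the helper names come from tests/utils.py and tests/models/utils.py as modified later in this series, while MODEL, DEFAULT_SERVER_ARGS, the exact CLI flags, and the fixture wiring are placeholders rather than the final test code.

    async def test_multi_step_vs_single_step(example_prompts):
        # Ground-truth reference: same server configuration, single-step
        # scheduling.
        ref_completions = await completions_with_server_args(
            example_prompts, MODEL,
            DEFAULT_SERVER_ARGS + ["--num-scheduler-steps", "1"],
            num_logprobs=5)
        # Under test: multi-step scheduling (e.g. 8 scheduler steps).
        test_completions = await completions_with_server_args(
            example_prompts, MODEL,
            DEFAULT_SERVER_ARGS + ["--num-scheduler-steps", "8"],
            num_logprobs=5)

        # Under greedy sampling the generated text should match exactly...
        assert (get_client_text_generations(ref_completions) ==
                get_client_text_generations(test_completions))

        # ...while logprobs only need to be close, not identical.
        check_logprobs_close(
            outputs_0_lst=get_client_text_logprob_generations(ref_completions),
            outputs_1_lst=get_client_text_logprob_generations(test_completions),
            name_0="single-step",
            name_1="multi-step")
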
From 6e6711f5ce8b5f26ca9954a32c2b949c2a50d15f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 22 Aug 2024 13:32:05 -0400 Subject: [PATCH 22/47] fixed comment --- tests/multi_step/test_correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness.py index ea0181e2bf47c..945602f6b7b2f 100644 --- a/tests/multi_step/test_correctness.py +++ b/tests/multi_step/test_correctness.py @@ -61,7 +61,7 @@ async def test_multi_step(example_prompts, model: str, tp_size: int, num_prompts: number of example prompts under test num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> no logprobs - ''' + """ prompts = example_prompts if len(prompts) < num_prompts: From 1cc93dd7150a8a62d1b22fe8d6910df18ee78af4 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 23 Aug 2024 05:38:02 -0400 Subject: [PATCH 23/47] rename --- .../basic_correctness/test_chunked_prefill.py | 5 +-- ...t_basic_distributed_correctness_enc_dec.py | 4 +-- tests/models/test_bart.py | 4 +-- tests/models/test_blip2.py | 4 +-- tests/models/test_fuyu.py | 4 +-- tests/models/test_gguf.py | 4 +-- tests/models/test_gptq_marlin.py | 4 +-- tests/models/test_gptq_marlin_24.py | 4 +-- tests/models/test_internvl.py | 6 ++-- tests/models/test_llava.py | 4 +-- tests/models/test_llava_image_embeds.py | 4 +-- tests/models/test_llava_next.py | 4 +-- tests/models/test_marlin.py | 4 +-- tests/models/test_minicpmv.py | 6 ++-- tests/models/test_mistral.py | 4 +-- tests/models/test_paligemma.py | 4 +-- tests/models/test_phi3v.py | 4 +-- tests/models/test_qwen.py | 4 +-- tests/models/test_ultravox.py | 4 +-- tests/models/utils.py | 34 +++++++++++-------- 20 files changed, 61 insertions(+), 54 deletions(-) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 9c6364ecc6792..559a08a828d03 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -9,7 +9,8 @@ import pytest -from ..models.utils import check_logprobs_close, check_outputs_equal +from ..models.utils import (check_outputs_equal, + check_tokenstexts_match_or_in_top_logprobs) MODELS = [ "facebook/opt-125m", @@ -144,7 +145,7 @@ def test_models_with_fp8_kv_cache( chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=no_chunked_prefill_outputs, outputs_1_lst=chunked_prefill_outputs, name_0="no_chunked_prefill", diff --git a/tests/distributed/test_basic_distributed_correctness_enc_dec.py b/tests/distributed/test_basic_distributed_correctness_enc_dec.py index f00d5ef584a2a..0334108ad6a82 100644 --- a/tests/distributed/test_basic_distributed_correctness_enc_dec.py +++ b/tests/distributed/test_basic_distributed_correctness_enc_dec.py @@ -15,7 +15,7 @@ from vllm.utils import cuda_device_count_stateless from ..conftest import DecoderPromptType -from ..models.utils import check_logprobs_close +from ..models.utils import check_tokenstexts_match_or_in_top_logprobs from ..utils import fork_new_process_for_each_test @@ -94,7 +94,7 @@ def test_models( **hf_kwargs, )) - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index 660b61d1a7ade..d448aea75be61 100644 --- 
a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -17,7 +17,7 @@ from vllm.sequence import SampleLogprobs from ..conftest import DecoderPromptType - from .utils import check_logprobs_close + from .utils import check_tokenstexts_match_or_in_top_logprobs MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] @@ -158,7 +158,7 @@ def test_models( hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE else 0) - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, decoder_prompt_type) diff --git a/tests/models/test_blip2.py b/tests/models/test_blip2.py index 5d48bad0d7b35..625afb4541f37 100644 --- a/tests/models/test_blip2.py +++ b/tests/models/test_blip2.py @@ -7,7 +7,7 @@ from vllm.sequence import SampleLogprobs from ..conftest import IMAGE_ASSETS -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs pytestmark = pytest.mark.vlm @@ -92,7 +92,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_fuyu.py b/tests/models/test_fuyu.py index 0d666d8f71a92..37f929c467ece 100644 --- a/tests/models/test_fuyu.py +++ b/tests/models/test_fuyu.py @@ -7,7 +7,7 @@ from vllm.utils import is_cpu from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs pytestmark = pytest.mark.vlm @@ -96,7 +96,7 @@ def run_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs diff --git a/tests/models/test_gguf.py b/tests/models/test_gguf.py index 196cd88e039a1..7699d604d4f03 100644 --- a/tests/models/test_gguf.py +++ b/tests/models/test_gguf.py @@ -11,7 +11,7 @@ from tests.quantization.utils import is_quant_method_supported -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -82,7 +82,7 @@ def test_models( gguf_outputs = gguf_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=original_outputs, outputs_1_lst=gguf_outputs, name_0="original", diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 4abbc41c9c287..2f16db775ea65 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -15,7 +15,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -88,7 +88,7 @@ def test_models( gptq_outputs = gptq_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=gptq_outputs, outputs_1_lst=gptq_marlin_outputs, name_0="gptq", diff --git 
a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 60d9ae2f1c629..72d4be3ed7fd8 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -10,7 +10,7 @@ import pytest -from tests.models.utils import check_logprobs_close +from tests.models.utils import check_tokenstexts_match_or_in_top_logprobs from tests.quantization.utils import is_quant_method_supported @@ -63,7 +63,7 @@ def test_models( gptq_outputs = gptq_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=gptq_outputs, outputs_1_lst=marlin_24_outputs, name_0="gptq", diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py index d032f3be84b58..684ff6c169975 100644 --- a/tests/models/test_internvl.py +++ b/tests/models/test_internvl.py @@ -14,7 +14,7 @@ from vllm.utils import is_cpu from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs pytestmark = pytest.mark.vlm @@ -170,7 +170,7 @@ def run_test( vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", @@ -238,7 +238,7 @@ def run_awq_test( quant_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=source_outputs, outputs_1_lst=quant_outputs, name_0="source", diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 93634f245cee7..69f4a65d27286 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -9,7 +9,7 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs pytestmark = pytest.mark.vlm @@ -138,7 +138,7 @@ def process(hf_inputs: BatchEncoding): vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_llava_image_embeds.py b/tests/models/test_llava_image_embeds.py index cc444fe32e79b..0487063eacc8f 100644 --- a/tests/models/test_llava_image_embeds.py +++ b/tests/models/test_llava_image_embeds.py @@ -6,7 +6,7 @@ from vllm.sequence import SampleLogprobs from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs pytestmark = pytest.mark.vlm @@ -119,7 +119,7 @@ def run_test( vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index 9cf55c0858df0..150b588da0cd3 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -7,7 +7,7 @@ from vllm.sequence import SampleLogprobs from 
..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs pytestmark = pytest.mark.vlm @@ -143,7 +143,7 @@ def run_test( vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index e86f6e29d1567..bd8c1e903e07b 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -16,7 +16,7 @@ from tests.quantization.utils import is_quant_method_supported -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs @dataclass @@ -61,7 +61,7 @@ def test_models( gptq_outputs = gptq_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=gptq_outputs, outputs_1_lst=marlin_outputs, name_0="gptq", diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index bf72dad0d1f5b..6d4e2b61727da 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -9,7 +9,7 @@ from vllm.sequence import SampleLogprobs from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs pytestmark = pytest.mark.vlm @@ -109,7 +109,7 @@ def run_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=[ trunc_hf_output(hf_output) for hf_output in hf_outputs ], @@ -226,7 +226,7 @@ def run_multi_image_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case): - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=[ trunc_hf_output(hf_output) for hf_output in hf_outputs ], diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 6acc057fe588c..7bb24cb89b960 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -4,7 +4,7 @@ """ import pytest -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", @@ -33,7 +33,7 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", diff --git a/tests/models/test_paligemma.py b/tests/models/test_paligemma.py index beddaaf608a18..fd8f308dbeaf1 100644 --- a/tests/models/test_paligemma.py +++ b/tests/models/test_paligemma.py @@ -9,7 +9,7 @@ from vllm.utils import is_hip from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs pytestmark = pytest.mark.vlm @@ -115,7 +115,7 @@ def run_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git 
a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 197e63b1b1e52..65247ef6ecde5 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -10,7 +10,7 @@ from vllm.utils import is_cpu, is_hip from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs pytestmark = pytest.mark.vlm @@ -124,7 +124,7 @@ def run_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_qwen.py b/tests/models/test_qwen.py index 0f974fcc1885c..101954f28fc20 100644 --- a/tests/models/test_qwen.py +++ b/tests/models/test_qwen.py @@ -3,7 +3,7 @@ import pytest from ..conftest import HfRunner, VllmRunner -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs models = ["qwen/qwen-vl"] @@ -40,7 +40,7 @@ def test_text_only_qwen_model( num_logprobs=num_logprobs, ) - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", diff --git a/tests/models/test_ultravox.py b/tests/models/test_ultravox.py index 98de10aa08408..b898c1ab936ae 100644 --- a/tests/models/test_ultravox.py +++ b/tests/models/test_ultravox.py @@ -10,7 +10,7 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from ..conftest import HfRunner, VllmRunner -from .utils import check_logprobs_close +from .utils import check_tokenstexts_match_or_in_top_logprobs pytestmark = pytest.mark.vlm @@ -123,7 +123,7 @@ def process(hf_inputs: BatchEncoding): for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio, vllm_outputs_per_audio): - check_logprobs_close( + check_tokenstexts_match_or_in_top_logprobs( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/utils.py b/tests/models/utils.py index ff29a0ae81d6e..40e025c29d06f 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -39,7 +39,7 @@ def check_outputs_equal( SampleLogprobs]]] -def check_logprobs_close( +def check_tokenstexts_match_or_in_top_logprobs( *, outputs_0_lst: Sequence[TokensTextLogprobs], outputs_1_lst: Sequence[TokensTextLogprobs], @@ -47,24 +47,30 @@ def check_logprobs_close( name_1: str, num_outputs_0_skip_tokens: int = 0, warn_on_mismatch: bool = True, -): - """ - Compare the logprobs of two sequences generated by different models, - which should be similar but not necessarily equal. - - Arguments: - - * outputs_0_lst: First sequence to compare - * outputs_0_lst: Second sequence to compare - * name_0: sequence #0 name - * name_1: sequence #1 name - * num_outputs_0_skip_tokens: If > 0, specifies the number of initial +) -> None: + """Compare two sequences, allowing for small differences + resulting from rounding error. + + The sequence can be represented by token lists or text, coupled + with logprobs. + + The primary test for a match is whether the text/tokens are identical. + However, if the text/tokens mismatch at a particular sequence offset, + the fallback is to check where the sequence 0 token is in the set + of top logprobs associated with sequence 1, and visa-versa. 
+ + Args: + outputs_0_lst: First sequence to compare + outputs_0_lst: Second sequence to compare + name_0: sequence #0 name + name_1: sequence #1 name + num_outputs_0_skip_tokens: If > 0, specifies the number of initial sequence #0 tokens & logprobs to discard before comparison, i.e. all of sequence #1 will be compared to sequence #0 beginning at index num_outputs_0_skip_tokens - * warn_on_mismatch: Issue a warning if there is token-wise or text-wise + warn_on_mismatch: Issue a warning if there is token-wise or text-wise mismatch between the two sequences """ assert len(outputs_0_lst) == len(outputs_1_lst) From da5826b83b7e1fead1b17ed58b819a6d0444232e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 26 Aug 2024 07:29:15 -0400 Subject: [PATCH 24/47] test modification --- tests/multi_step/test_correctness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/multi_step/test_correctness.py b/tests/multi_step/test_correctness.py index 945602f6b7b2f..ddfc1a7844591 100644 --- a/tests/multi_step/test_correctness.py +++ b/tests/multi_step/test_correctness.py @@ -101,6 +101,6 @@ async def test_multi_step(example_prompts, model: str, tp_size: int, assert_all_close_logprobs( ref_logprobs, test_logprobs, - atol=1e-5, - rtol=1e-5, + atol=1e-2, + rtol=1e-2, ) From 1e4265628e04b22f35df582b13eddcbb24e45075 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 04:21:35 -0400 Subject: [PATCH 25/47] formatting --- tests/multi_step/test_correctness_async_llm.py | 1 + vllm/engine/llm_engine.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index a2a14d2849bfb..70b58eeb2e78d 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -24,6 +24,7 @@ "16", ] + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize(("tp_size, pp_size"), [ (1, 1), diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index cbd1b71b1b738..0afefeb87fc1c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -40,8 +40,8 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest, - Sequence, SequenceGroup, - SequenceGroupMetadata, SequenceStatus) + Sequence, SequenceGroup, SequenceGroupMetadata, + SequenceStatus) from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, init_tracer) from vllm.transformers_utils.config import try_get_generation_config From cd0fdf9e547e3a41076257c3f1d62ca1e7a3238a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 04:45:07 -0400 Subject: [PATCH 26/47] disabled logprobs pythonization when logprobs are disabled --- vllm/worker/multi_step_model_runner.py | 42 +++++++++++++++++++------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 47b567e98f431..3c6a1183e71b7 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -18,7 +18,7 @@ SamplingMetadata, get_logprobs, get_pythonized_sample_results) from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - SequenceGroupMetadata, SequenceOutput) + SequenceGroupMetadata, SequenceOutput, Logprob) from vllm.worker.model_runner import (GPUModelRunnerBase, 
ModelInputForGPUWithSamplingMetadata) from vllm.worker.model_runner_base import ( @@ -525,17 +525,24 @@ def _pythonize_sampler_output( # # However this computation may be skipped entirely # if no pythonization was deferred. + seq_groups = sampling_metadata.seq_groups + logprobs_are_requested = any([ + sg.sampling_params.logprobs is not None + or sg.sampling_params.prompt_logprobs is not None for sg in seq_groups + ]) + do_pythonize_logprobs = (skip_sampler_cpu_output + and logprobs_are_requested) ( prompt_logprobs, sample_logprobs, ) = (deferred_pythonize_logprobs(output, sampling_metadata, logprobs_tensor) - if skip_sampler_cpu_output else (None, None)) + if do_pythonize_logprobs else (None, None)) - for sgdx, (seq_group, sample_result) in enumerate( - zip(sampling_metadata.seq_groups, samples_list)): + for sgdx, (seq_group, + sample_result) in enumerate(zip(seq_groups, samples_list)): - if skip_sampler_cpu_output: + if do_pythonize_logprobs: assert prompt_logprobs is not None assert sample_logprobs is not None @@ -545,7 +552,12 @@ def _pythonize_sampler_output( ) = ( # Utilize deferred pythonization results prompt_logprobs[sgdx], sample_logprobs[sgdx], - ) if skip_sampler_cpu_output else ( + ) + elif logprobs_are_requested: + ( + group_prompt_logprobs, + group_sample_logprobs, + ) = ( # profile_run: use already-computed logprobs output.outputs[sgdx].prompt_logprobs, [sample.logprobs for sample in output.outputs[sgdx].samples]) @@ -557,11 +569,19 @@ def _pythonize_sampler_output( if seq_group.sampling_params.logits_processors: assert len(seq_group.sampling_params.logits_processors) == 0, ( "Logits Processors are not supported in multi-step decoding") - for (parent_id, next_token_id, - logprobs) in zip(parent_ids, next_token_ids, - group_sample_logprobs): + for tdx, (parent_id, + next_token_id) in enumerate(zip(parent_ids, next_token_ids)): seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) + SequenceOutput(seq_ids[parent_id], next_token_id, + (group_sample_logprobs[tdx] + if logprobs_are_requested else { + next_token_id: + Logprob(logprob=float('inf'), + rank=None, + decoded_token=None) + }))) output.outputs.append( - CompletionSequenceGroupOutput(seq_outputs, group_prompt_logprobs)) + CompletionSequenceGroupOutput( + seq_outputs, + (group_prompt_logprobs if logprobs_are_requested else None))) assert len(output.outputs) > 0 From 3fecbc4e2cb5342c95964d2f4e833968fa83fde5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 05:39:13 -0400 Subject: [PATCH 27/47] wip --- .../basic_correctness/test_chunked_prefill.py | 5 +- ...t_basic_distributed_correctness_enc_dec.py | 4 +- tests/models/test_bart.py | 4 +- tests/models/test_blip2.py | 4 +- tests/models/test_fuyu.py | 4 +- tests/models/test_gguf.py | 4 +- tests/models/test_gptq_marlin.py | 4 +- tests/models/test_gptq_marlin_24.py | 4 +- tests/models/test_internvl.py | 6 +- tests/models/test_llava.py | 4 +- tests/models/test_llava_image_embeds.py | 4 +- tests/models/test_llava_next.py | 4 +- tests/models/test_marlin.py | 4 +- tests/models/test_minicpmv.py | 6 +- tests/models/test_mistral.py | 4 +- tests/models/test_paligemma.py | 4 +- tests/models/test_phi3v.py | 6 +- tests/models/test_qwen.py | 4 +- tests/models/test_ultravox.py | 4 +- tests/models/utils.py | 45 ++++++------ tests/multi_step/test_correctness_llm.py | 69 +++++++++++++------ 21 files changed, 111 insertions(+), 86 deletions(-) diff --git a/tests/basic_correctness/test_chunked_prefill.py 
b/tests/basic_correctness/test_chunked_prefill.py index d086b882717ad..deb02c1ed4d2d 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -9,8 +9,7 @@ import pytest -from ..models.utils import (check_outputs_equal, - check_tokenstexts_match_or_in_top_logprobs) +from ..models.utils import (check_outputs_equal, check_logprobs_close) MODELS = [ "facebook/opt-125m", @@ -151,7 +150,7 @@ def test_models_with_fp8_kv_cache( chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=no_chunked_prefill_outputs, outputs_1_lst=chunked_prefill_outputs, name_0="no_chunked_prefill", diff --git a/tests/distributed/test_basic_distributed_correctness_enc_dec.py b/tests/distributed/test_basic_distributed_correctness_enc_dec.py index 0334108ad6a82..f00d5ef584a2a 100644 --- a/tests/distributed/test_basic_distributed_correctness_enc_dec.py +++ b/tests/distributed/test_basic_distributed_correctness_enc_dec.py @@ -15,7 +15,7 @@ from vllm.utils import cuda_device_count_stateless from ..conftest import DecoderPromptType -from ..models.utils import check_tokenstexts_match_or_in_top_logprobs +from ..models.utils import check_logprobs_close from ..utils import fork_new_process_for_each_test @@ -94,7 +94,7 @@ def test_models( **hf_kwargs, )) - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index d448aea75be61..660b61d1a7ade 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -17,7 +17,7 @@ from vllm.sequence import SampleLogprobs from ..conftest import DecoderPromptType - from .utils import check_tokenstexts_match_or_in_top_logprobs + from .utils import check_logprobs_close MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] @@ -158,7 +158,7 @@ def test_models( hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE else 0) - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, decoder_prompt_type) diff --git a/tests/models/test_blip2.py b/tests/models/test_blip2.py index 625afb4541f37..5d48bad0d7b35 100644 --- a/tests/models/test_blip2.py +++ b/tests/models/test_blip2.py @@ -7,7 +7,7 @@ from vllm.sequence import SampleLogprobs from ..conftest import IMAGE_ASSETS -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -92,7 +92,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_fuyu.py b/tests/models/test_fuyu.py index 37f929c467ece..0d666d8f71a92 100644 --- a/tests/models/test_fuyu.py +++ b/tests/models/test_fuyu.py @@ -7,7 +7,7 @@ from vllm.utils import is_cpu from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -96,7 +96,7 @@ def run_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): - 
check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs diff --git a/tests/models/test_gguf.py b/tests/models/test_gguf.py index 7699d604d4f03..196cd88e039a1 100644 --- a/tests/models/test_gguf.py +++ b/tests/models/test_gguf.py @@ -11,7 +11,7 @@ from tests.quantization.utils import is_quant_method_supported -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -82,7 +82,7 @@ def test_models( gguf_outputs = gguf_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=original_outputs, outputs_1_lst=gguf_outputs, name_0="original", diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 2f16db775ea65..4abbc41c9c287 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -15,7 +15,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -88,7 +88,7 @@ def test_models( gptq_outputs = gptq_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=gptq_outputs, outputs_1_lst=gptq_marlin_outputs, name_0="gptq", diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 72d4be3ed7fd8..60d9ae2f1c629 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -10,7 +10,7 @@ import pytest -from tests.models.utils import check_tokenstexts_match_or_in_top_logprobs +from tests.models.utils import check_logprobs_close from tests.quantization.utils import is_quant_method_supported @@ -63,7 +63,7 @@ def test_models( gptq_outputs = gptq_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=gptq_outputs, outputs_1_lst=marlin_24_outputs, name_0="gptq", diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py index 51d9e6451f2d1..243bc857c88de 100644 --- a/tests/models/test_internvl.py +++ b/tests/models/test_internvl.py @@ -13,7 +13,7 @@ from vllm.utils import is_cpu from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -164,7 +164,7 @@ def run_test( vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", @@ -232,7 +232,7 @@ def run_awq_test( quant_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=source_outputs, outputs_1_lst=quant_outputs, name_0="source", diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 69f4a65d27286..93634f245cee7 100644 --- a/tests/models/test_llava.py +++ 
b/tests/models/test_llava.py @@ -9,7 +9,7 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -138,7 +138,7 @@ def process(hf_inputs: BatchEncoding): vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_llava_image_embeds.py b/tests/models/test_llava_image_embeds.py index 0487063eacc8f..cc444fe32e79b 100644 --- a/tests/models/test_llava_image_embeds.py +++ b/tests/models/test_llava_image_embeds.py @@ -6,7 +6,7 @@ from vllm.sequence import SampleLogprobs from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -119,7 +119,7 @@ def run_test( vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index 150b588da0cd3..9cf55c0858df0 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -7,7 +7,7 @@ from vllm.sequence import SampleLogprobs from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -143,7 +143,7 @@ def run_test( vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index bd8c1e903e07b..e86f6e29d1567 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -16,7 +16,7 @@ from tests.quantization.utils import is_quant_method_supported -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close @dataclass @@ -61,7 +61,7 @@ def test_models( gptq_outputs = gptq_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=gptq_outputs, outputs_1_lst=marlin_outputs, name_0="gptq", diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index 6d4e2b61727da..bf72dad0d1f5b 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -9,7 +9,7 @@ from vllm.sequence import SampleLogprobs from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -109,7 +109,7 @@ def run_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=[ trunc_hf_output(hf_output) for hf_output in hf_outputs ], @@ -226,7 +226,7 @@ 
def run_multi_image_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case): - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=[ trunc_hf_output(hf_output) for hf_output in hf_outputs ], diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 7bb24cb89b960..6acc057fe588c 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -4,7 +4,7 @@ """ import pytest -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", @@ -33,7 +33,7 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", diff --git a/tests/models/test_paligemma.py b/tests/models/test_paligemma.py index fd8f308dbeaf1..beddaaf608a18 100644 --- a/tests/models/test_paligemma.py +++ b/tests/models/test_paligemma.py @@ -9,7 +9,7 @@ from vllm.utils import is_hip from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -115,7 +115,7 @@ def run_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 3f56c3511f48f..259cbe515066d 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -11,7 +11,7 @@ from vllm.utils import is_cpu, is_hip from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -124,7 +124,7 @@ def run_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, vllm_outputs_per_image): - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) @@ -253,7 +253,7 @@ def run_multi_image_test( for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case): - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/test_qwen.py b/tests/models/test_qwen.py index 101954f28fc20..0f974fcc1885c 100644 --- a/tests/models/test_qwen.py +++ b/tests/models/test_qwen.py @@ -3,7 +3,7 @@ import pytest from ..conftest import HfRunner, VllmRunner -from .utils import check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close models = ["qwen/qwen-vl"] @@ -40,7 +40,7 @@ def test_text_only_qwen_model( num_logprobs=num_logprobs, ) - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", diff --git a/tests/models/test_ultravox.py b/tests/models/test_ultravox.py index b898c1ab936ae..98de10aa08408 100644 --- a/tests/models/test_ultravox.py +++ b/tests/models/test_ultravox.py @@ -10,7 +10,7 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from ..conftest import HfRunner, VllmRunner -from .utils import 
check_tokenstexts_match_or_in_top_logprobs +from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -123,7 +123,7 @@ def process(hf_inputs: BatchEncoding): for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio, vllm_outputs_per_audio): - check_tokenstexts_match_or_in_top_logprobs( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=[ vllm_to_hf_output(vllm_output, model) diff --git a/tests/models/utils.py b/tests/models/utils.py index 40e025c29d06f..2a33d139fb6a2 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -1,7 +1,7 @@ import warnings from typing import Dict, List, Optional, Sequence, Tuple, Union -from vllm.sequence import SampleLogprobs +from vllm.sequence import SampleLogprobs, Logprob TokensText = Tuple[List[int], str] @@ -38,39 +38,38 @@ def check_outputs_equal( float]], SampleLogprobs]]] +# OpenAI-API-style logprobs +OpenAIAPILogprobs = Tuple[List[str], str, + Optional[Union[List[Dict[str, float]], + List[Dict[str, Logprob]]]]] -def check_tokenstexts_match_or_in_top_logprobs( + +def check_logprobs_close( *, - outputs_0_lst: Sequence[TokensTextLogprobs], - outputs_1_lst: Sequence[TokensTextLogprobs], + outputs_0_lst: Sequence[Union[TokensTextLogprobs, OpenAIAPILogprobs]], + outputs_1_lst: Sequence[Union[TokensTextLogprobs, OpenAIAPILogprobs]], name_0: str, name_1: str, num_outputs_0_skip_tokens: int = 0, warn_on_mismatch: bool = True, -) -> None: - """Compare two sequences, allowing for small differences - resulting from rounding error. - - The sequence can be represented by token lists or text, coupled - with logprobs. - - The primary test for a match is whether the text/tokens are identical. - However, if the text/tokens mismatch at a particular sequence offset, - the fallback is to check where the sequence 0 token is in the set - of top logprobs associated with sequence 1, and visa-versa. - - Args: - outputs_0_lst: First sequence to compare - outputs_0_lst: Second sequence to compare - name_0: sequence #0 name - name_1: sequence #1 name - num_outputs_0_skip_tokens: If > 0, specifies the number of initial +): + """ + Compare the logprobs of two sequences generated by different models, + which should be similar but not necessarily equal. + + Arguments: + + * outputs_0_lst: First sequence to compare + * outputs_0_lst: Second sequence to compare + * name_0: sequence #0 name + * name_1: sequence #1 name + * num_outputs_0_skip_tokens: If > 0, specifies the number of initial sequence #0 tokens & logprobs to discard before comparison, i.e. 
all of sequence #1 will be compared to sequence #0 beginning at index num_outputs_0_skip_tokens - warn_on_mismatch: Issue a warning if there is token-wise or text-wise + * warn_on_mismatch: Issue a warning if there is token-wise or text-wise mismatch between the two sequences """ assert len(outputs_0_lst) == len(outputs_1_lst) diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 36f610ba74f05..0bee462210c1c 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -1,8 +1,9 @@ # Test the LLMEngine with multi-step-decoding import pytest +from typing import Optional -from ..models.utils import check_outputs_equal +from ..models.utils import check_outputs_equal, check_logprobs_close MODELS = [ "JackFram/llama-160m", @@ -18,10 +19,20 @@ @pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -def test_multi_step_llm(hf_runner, vllm_runner, example_prompts, model: str, - dtype: str, tp_size: int, max_tokens: int, - enforce_eager: int, num_scheduler_steps: int, - num_prompts: int) -> None: +@pytest.mark.parametrize("num_logprobs", [None, 5]) +def test_multi_step_llm( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + tp_size: int, + max_tokens: int, + enforce_eager: int, + num_scheduler_steps: int, + num_prompts: int, + num_logprobs: Optional[int], +) -> None: prompts = example_prompts if len(prompts) < num_prompts: @@ -29,21 +40,37 @@ def test_multi_step_llm(hf_runner, vllm_runner, example_prompts, model: str, prompts = prompts[:num_prompts] assert len(prompts) == num_prompts - with vllm_runner(model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - use_v2_block_manager=True, - num_scheduler_steps=num_scheduler_steps) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + use_v2_block_manager=True, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) + if num_logprobs is None else + vllm_model.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs)) with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) + if num_logprobs is None else + hf_model.generate_greedy_logprobs_limit( + prompts, max_tokens, num_logprobs)) + + if num_logprobs is None: + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + else: + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) From 67bd035bb7f0c70a0abaa52f170e1fd02f5ea7c7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 06:48:56 -0400 Subject: [PATCH 28/47] skip logprobs processing entirely when logprobs are not enabled; formatting --- .../basic_correctness/test_chunked_prefill.py | 2 +- tests/models/utils.py | 42 ++++++++++--------- .../multi_step/test_correctness_async_llm.py | 17 +++++--- tests/multi_step/test_correctness_llm.py | 5 ++- tests/utils.py | 
22 ++++------ vllm/worker/multi_step_model_runner.py | 2 +- 6 files changed, 47 insertions(+), 43 deletions(-) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index deb02c1ed4d2d..1211e6ba5aafc 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -9,7 +9,7 @@ import pytest -from ..models.utils import (check_outputs_equal, check_logprobs_close) +from ..models.utils import check_logprobs_close, check_outputs_equal MODELS = [ "facebook/opt-125m", diff --git a/tests/models/utils.py b/tests/models/utils.py index 2a33d139fb6a2..2e7ab51ec4146 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -1,7 +1,7 @@ import warnings from typing import Dict, List, Optional, Sequence, Tuple, Union -from vllm.sequence import SampleLogprobs, Logprob +from vllm.sequence import Logprob, SampleLogprobs TokensText = Tuple[List[int], str] @@ -39,38 +39,38 @@ def check_outputs_equal( SampleLogprobs]]] # OpenAI-API-style logprobs -OpenAIAPILogprobs = Tuple[List[str], str, - Optional[Union[List[Dict[str, float]], - List[Dict[str, Logprob]]]]] +TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]], + List[Dict[str, + Logprob]]]]] def check_logprobs_close( *, - outputs_0_lst: Sequence[Union[TokensTextLogprobs, OpenAIAPILogprobs]], - outputs_1_lst: Sequence[Union[TokensTextLogprobs, OpenAIAPILogprobs]], + outputs_0_lst: Sequence[Union[TokensTextLogprobs, TextTextLogprobs]], + outputs_1_lst: Sequence[Union[TokensTextLogprobs, TextTextLogprobs]], name_0: str, name_1: str, num_outputs_0_skip_tokens: int = 0, warn_on_mismatch: bool = True, + always_check_logprobs: bool = False, ): - """ - Compare the logprobs of two sequences generated by different models, + """Compare the logprobs of two sequences generated by different models, which should be similar but not necessarily equal. - Arguments: - - * outputs_0_lst: First sequence to compare - * outputs_0_lst: Second sequence to compare - * name_0: sequence #0 name - * name_1: sequence #1 name - * num_outputs_0_skip_tokens: If > 0, specifies the number of initial + Args: + outputs_0_lst: First sequence to compare + outputs_0_lst: Second sequence to compare + name_0: sequence #0 name + name_1: sequence #1 name + num_outputs_0_skip_tokens: If > 0, specifies the number of initial sequence #0 tokens & logprobs to discard before comparison, i.e. 
all of sequence #1 will be compared to sequence #0 beginning at index num_outputs_0_skip_tokens - * warn_on_mismatch: Issue a warning if there is token-wise or text-wise + warn_on_mismatch: Issue a warning if there is token-wise or text-wise mismatch between the two sequences + always_check_logprobs: If true, check logprobs even when tokens match """ assert len(outputs_0_lst) == len(outputs_1_lst) @@ -99,8 +99,12 @@ def check_logprobs_close( for idx, (output_id_0, output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): - # If generated tokens don't match, then - if output_id_0 != output_id_1: + is_tok_mismatch = output_id_0 != output_id_1 + + # If generated tokens don't match + # or it is desired to always check logprobs, + # then + if is_tok_mismatch or always_check_logprobs: logprobs_elem_0 = logprobs_0[idx] logprobs_elem_1 = logprobs_1[idx] @@ -116,7 +120,7 @@ def check_logprobs_close( assert output_id_0 in logprobs_elem_1, fail_msg assert output_id_1 in logprobs_elem_0, fail_msg - if warn_on_mismatch: + if warn_on_mismatch and is_tok_mismatch: with warnings.catch_warnings(): # This ensures that repeated warnings are shown # in the output, not just the first occurrence diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 70b58eeb2e78d..21ea5ce3af776 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -4,9 +4,9 @@ import pytest -from ..utils import (completions_with_server_args, - get_client_logprob_generations, - get_client_text_generations) +from ..models.utils import check_logprobs_close +from ..utils import (completions_with_server_args, get_client_text_generations, + get_client_text_logprob_generations) MODELS = [ "JackFram/llama-160m", @@ -99,5 +99,12 @@ async def test_multi_step(example_prompts, model: str, tp_size: int, # Assert multi-step scheduling produces nearly-identical logprobs # to single-step scheduling. 
- ref_logprobs = get_client_logprob_generations(ref_completions) - test_logprobs = get_client_logprob_generations(test_completions) + ref_text_logprobs = get_client_text_logprob_generations(ref_completions) + test_text_logprobs = get_client_text_logprob_generations(test_completions) + + check_logprobs_close( + outputs_0_lst=ref_text_logprobs, + outputs_1_lst=test_text_logprobs, + name_0="hf", + name_1="vllm", + ) \ No newline at end of file diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 0bee462210c1c..6aa6ec0fbb64e 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -1,9 +1,10 @@ # Test the LLMEngine with multi-step-decoding -import pytest from typing import Optional -from ..models.utils import check_outputs_equal, check_logprobs_close +import pytest + +from ..models.utils import check_logprobs_close, check_outputs_equal MODELS = [ "JackFram/llama-160m", diff --git a/tests/utils.py b/tests/utils.py index bdf068ed4fa82..58efb91ce78dc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -15,6 +15,7 @@ from transformers import AutoTokenizer from typing_extensions import ParamSpec +from tests.models.utils import TextTextLogprobs from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.engine.arg_utils import AsyncEngineArgs @@ -474,23 +475,14 @@ def get_client_text_generations(completions: Completion): return [x.text for x in completions.choices] -'''Logprobs values are extracted as -List[Optional[List[Dict[str,float]]]], i.e.: - -For each :class:`SequenceGroup`... - ...if the completions API was invoked with a non-`None` `logprobs` argument: - ...for each token offset in a sequence... - ...store a mapping from str(token) -> logprob - ...else, if the completions API was invoked with `logprobs=None`: - ...store None -''' -LogprobType = List[Optional[List[Dict[str, float]]]] - - -def get_client_logprob_generations(completions: Completion) -> LogprobType: +def get_client_text_logprob_generations( + completions: Completion) -> List[TextTextLogprobs]: '''Operates on the output of a request made to an Open-AI-protocol completions endpoint; obtains top-rank logprobs for each token in each :class:`SequenceGroup` ''' - return [(None if x.logprobs is None else x.logprobs.top_logprobs) + text_generations = get_client_text_generations(completions) + text = ''.join(text_generations) + return [(text_generations, text, + (None if x.logprobs is None else x.logprobs.top_logprobs)) for x in completions.choices] diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 3c6a1183e71b7..4a6062aecd62d 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -18,7 +18,7 @@ SamplingMetadata, get_logprobs, get_pythonized_sample_results) from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - SequenceGroupMetadata, SequenceOutput, Logprob) + Logprob, SequenceGroupMetadata, SequenceOutput) from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPUWithSamplingMetadata) from vllm.worker.model_runner_base import ( From 419659d9de50e56e91901418f469a5ad841d3d62 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 07:12:33 -0400 Subject: [PATCH 29/47] multi-step output processing; formatting --- vllm/engine/output_processor/multi_step.py | 15 ++++- vllm/engine/output_processor/single_step.py | 65 +++++++++++++++------ 2 files changed, 
58 insertions(+), 22 deletions(-) diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 49a33ded5fcaa..5717b5eb7f208 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -4,6 +4,8 @@ from vllm.core.scheduler import Scheduler from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) +from vllm.engine.output_processor.single_step import ( + single_step_process_prompt_logprob) from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.sampling_params import SamplingParams @@ -46,9 +48,16 @@ def __init__( def process_prompt_logprob(self, seq_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: - # TODO(sang): Prompt logprob currently not implemented in multi step - # workers. - self._log_prompt_logprob_unsupported_warning_once() + """Process prompt logprobs associated with each step of a multi-step- + scheduled computation. + + Args: + seq_group: the output is associated with this :class:`SequenceGroup` + outputs: the :class:`SequenceGroupOutput`s for all scheduler steps + """ + for output in outputs: + # Concatenate single-step prompt logprob processing results. + single_step_process_prompt_logprob(self, seq_group, output) @staticmethod @functools.lru_cache() diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 4b0c3f37a5e21..422e6d30522f5 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -15,6 +15,44 @@ logger = init_logger(__name__) +def single_step_process_prompt_logprob( + sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup, + output: SequenceGroupOutput) -> None: + """Process prompt logprobs associated with the :class:`SequenceGroupOutput` + for a given step. + + Do nothing if the output has no prompt logprobs. + + Account for the fact that transformers do not compute first-token logprobs. + + Args: + sg_output_proc: :class:`SequenceGroupOutputProcessor` instance + seq_group: the output is associated with this :class:`SequenceGroup` + output: the :class:`SequenceGroupOutput` for a single scheduler step + """ + prompt_logprobs = output.prompt_logprobs + + # If this is the first (or only) "chunk" of the prefill, we need + # to prepend None to the list of prompt logprobs. The reason for this + # is that for N prompt tokens, the Sampler will generate N-1 total + # prompt logprobs during prefill since the token at idx 0 will not + # have a logprob associated with it. 
+ if prompt_logprobs is not None: + if not seq_group.prompt_logprobs: + prompt_logprobs = [None] + prompt_logprobs + seq_group.prompt_logprobs = [] + + assert hasattr(sg_output_proc, 'detokenizer') + if (seq_group.sampling_params.detokenize + and sg_output_proc.detokenizer): + sg_output_proc.detokenizer.decode_prompt_logprobs_inplace( + seq_group, + prompt_logprobs, + position_offset=len(seq_group.prompt_logprobs)) + + seq_group.prompt_logprobs.extend(prompt_logprobs) + + class SingleStepOutputProcessor(SequenceGroupOutputProcessor): """SequenceGroupOutputProcessor which handles "output processing" logic, which happens after the model returns generated token ids and before @@ -60,27 +98,16 @@ def process_outputs(self, sequence_group: SequenceGroup, def process_prompt_logprob(self, seq_group: SequenceGroup, outputs: List[SequenceGroupOutput]) -> None: + """Process prompt logprobs associated with one step of a single-step- + scheduled computation. + + Args: + seq_group: the output is associated with this :class:`SequenceGroup` + output: the :class:`SequenceGroupOutput` for a single scheduler step + """ assert len(outputs) == 1, ("Single step should only has 1 output.") output = outputs[0] - prompt_logprobs = output.prompt_logprobs - - # If this is the first (or only) "chunk" of the prefill, we need - # to prepend None to the list of prompt logprobs. The reason for this - # is that for N prompt tokens, the Sampler will generate N-1 total - # prompt logprobs during prefill since the token at idx 0 will not - # have a logprob associated with it. - if prompt_logprobs is not None: - if not seq_group.prompt_logprobs: - prompt_logprobs = [None] + prompt_logprobs - seq_group.prompt_logprobs = [] - - if seq_group.sampling_params.detokenize and self.detokenizer: - self.detokenizer.decode_prompt_logprobs_inplace( - seq_group, - prompt_logprobs, - position_offset=len(seq_group.prompt_logprobs)) - - seq_group.prompt_logprobs.extend(prompt_logprobs) + single_step_process_prompt_logprob(self, seq_group, output) def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput, From 55eaab90cc2fc500bd1acc9e4ec9e4568613bd6f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 07:25:34 -0400 Subject: [PATCH 30/47] wip --- tests/models/utils.py | 4 ++-- tests/multi_step/test_correctness_async_llm.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/models/utils.py b/tests/models/utils.py index 2e7ab51ec4146..93ec03995094b 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -38,7 +38,7 @@ def check_outputs_equal( float]], SampleLogprobs]]] -# OpenAI-API-style logprobs +# Allow for tokens to be represented as str's rather than IDs TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]], List[Dict[str, Logprob]]]]] @@ -53,7 +53,7 @@ def check_logprobs_close( num_outputs_0_skip_tokens: int = 0, warn_on_mismatch: bool = True, always_check_logprobs: bool = False, -): +) -> None: """Compare the logprobs of two sequences generated by different models, which should be similar but not necessarily equal. diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 21ea5ce3af776..ab05bcdb55743 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -101,10 +101,9 @@ async def test_multi_step(example_prompts, model: str, tp_size: int, # to single-step scheduling. 
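# Illustration (hypothetical values): each TextTextLogprobs-style element
# handed to check_logprobs_close below pairs token strings with per-token
# top-logprob dicts keyed by token string rather than token id.
example_text_text_logprobs = (
    [" Paris", " is"],                        # token string representations
    " Paris is",                              # generated text
    [{" Paris": -0.11, " Lyon": -2.31},       # top logprobs, token offset 0
     {" is": -0.05, " was": -3.12}],          # top logprobs, token offset 1
)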
ref_text_logprobs = get_client_text_logprob_generations(ref_completions) test_text_logprobs = get_client_text_logprob_generations(test_completions) - check_logprobs_close( outputs_0_lst=ref_text_logprobs, outputs_1_lst=test_text_logprobs, name_0="hf", name_1="vllm", - ) \ No newline at end of file + ) From bae1fb95ad4ecb092fbda49751e3df1dc83b17bf Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 07:37:03 -0400 Subject: [PATCH 31/47] small fixes --- tests/utils.py | 3 ++- vllm/engine/output_processor/multi_step.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 58efb91ce78dc..850648b3e597f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -449,6 +449,7 @@ async def completions_with_server_args( prompts: test prompts model_name: model to spin up on the vLLM server server_cli_args: CLI args for starting the server + num_logprobs: Number of logprobs to report (or `None`) Returns: OpenAI Completion instance @@ -468,7 +469,7 @@ async def completions_with_server_args( return outputs -def get_client_text_generations(completions: Completion): +def get_client_text_generations(completions: Completion) -> List[str]: '''Extract generated tokens from the output of a request made to an Open-AI-protocol completions endpoint. ''' diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 5717b5eb7f208..0209b0adc9831 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -52,7 +52,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, scheduled computation. Args: - seq_group: the output is associated with this :class:`SequenceGroup` + seq_group: the outputs are associated with this :class:`SequenceGroup` outputs: the :class:`SequenceGroupOutput`s for all scheduler steps """ for output in outputs: From 4c0c9f8ee1d14c2f5bf2b6a823447ed264265665 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 10:22:16 -0400 Subject: [PATCH 32/47] Added prompt logprobs tests --- tests/conftest.py | 81 +++++++++++++++--------- tests/models/utils.py | 70 ++++++++++++++++++-- tests/multi_step/test_correctness_llm.py | 64 +++++++++++++++++++ 3 files changed, 179 insertions(+), 36 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index ae362b228d9d8..a6289cccc54aa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,6 +19,8 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding, BatchFeature) +from tests.models.utils import (TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs) from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.config import TokenizerPoolConfig @@ -31,7 +33,6 @@ to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.sequence import SampleLogprobs from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, identity, is_cpu) @@ -426,7 +427,7 @@ def generate_greedy_logprobs_limit( images: Optional[List[Image.Image]] = None, audios: Optional[List[Tuple[np.ndarray, int]]] = None, **kwargs: Any, - ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]: + ) -> List[TokensTextLogprobs]: all_logprobs: List[List[Dict[int, float]]] = [] all_output_ids: List[List[int]] = [] all_output_strs: List[str] = [] @@ -480,7 +481,7 @@ def generate_encoder_decoder_greedy_logprobs_limit( max_tokens: int, num_logprobs: int, **kwargs: Any, - ) -> 
List[Tuple[List[int], str, List[Dict[int, float]]]]: + ) -> List[TokensTextLogprobs]: ''' Greedy logprobs generation for vLLM encoder/decoder models ''' @@ -609,14 +610,16 @@ def generate( def _final_steps_generate_w_logprobs( self, req_outputs: List[RequestOutput], - ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: - outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = [] + ) -> List[TokensTextLogprobsPromptLogprobs]: + outputs: List[TokensTextLogprobsPromptLogprobs] = [] for req_output in req_outputs: + assert len(req_output.outputs) > 0 for sample in req_output.outputs: output_str = sample.text output_ids = list(sample.token_ids) output_logprobs = sample.logprobs - outputs.append((output_ids, output_str, output_logprobs)) + outputs.append((output_ids, output_str, output_logprobs, + req_output.prompt_logprobs)) return outputs def generate_w_logprobs( @@ -627,7 +630,8 @@ def generate_w_logprobs( List[List[Image.Image]]]] = None, audios: Optional[Union[List[Tuple[np.ndarray, int]], List[List[Tuple[np.ndarray, int]]]]] = None - ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: assert sampling_params.logprobs is not None if images is not None: @@ -644,13 +648,20 @@ def generate_w_logprobs( req_outputs = self.model.generate(inputs, sampling_params=sampling_params) - return self._final_steps_generate_w_logprobs(req_outputs) + + toks_str_logsprobs_prompt_logprobs = ( + self._final_steps_generate_w_logprobs(req_outputs)) + # Omit prompt logprobs if not required by sampling params + return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None else + toks_str_logsprobs_prompt_logprobs) def generate_encoder_decoder_w_logprobs( self, encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], sampling_params: SamplingParams, - ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: ''' Logprobs generation for vLLM encoder/decoder models ''' @@ -658,7 +669,12 @@ def generate_encoder_decoder_w_logprobs( assert sampling_params.logprobs is not None req_outputs = self.model.generate(encoder_decoder_prompts, sampling_params=sampling_params) - return self._final_steps_generate_w_logprobs(req_outputs) + toks_str_logsprobs_prompt_logprobs = ( + self._final_steps_generate_w_logprobs(req_outputs)) + # Omit prompt logprobs if not required by sampling params + return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None else + toks_str_logsprobs_prompt_logprobs) def generate_greedy( self, @@ -676,44 +692,47 @@ def generate_greedy_logprobs( prompts: List[str], max_tokens: int, num_logprobs: int, + num_prompt_logprobs: Optional[int] = None, images: Optional[Union[List[Image.Image], List[List[Image.Image]]]] = None, audios: Optional[Union[List[Tuple[np.ndarray, int]], List[List[Tuple[np.ndarray, int]]]]] = None, stop_token_ids: Optional[List[int]] = None, - ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: - greedy_logprobs_params = SamplingParams(temperature=0.0, - max_tokens=max_tokens, - logprobs=num_logprobs, - stop_token_ids=stop_token_ids) - outputs = self.generate_w_logprobs(prompts, - greedy_logprobs_params, - images=images, - audios=audios) - - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] + ) -> Union[List[TokensTextLogprobs], + 
List[TokensTextLogprobsPromptLogprobs]]: + greedy_logprobs_params = SamplingParams( + temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs, + prompt_logprobs=(num_prompt_logprobs), + stop_token_ids=stop_token_ids) + return self.generate_w_logprobs(prompts, + greedy_logprobs_params, + images=images, + audios=audios) def generate_encoder_decoder_greedy_logprobs( self, encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, num_logprobs: int, - ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: - greedy_logprobs_params = SamplingParams(temperature=0.0, - use_beam_search=False, - max_tokens=max_tokens, - logprobs=num_logprobs) + num_prompt_logprobs: Optional[int] = None, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + greedy_logprobs_params = SamplingParams( + temperature=0.0, + use_beam_search=False, + max_tokens=max_tokens, + logprobs=num_logprobs, + prompt_logprobs=(num_prompt_logprobs), + ) ''' Greedy logprobs generation for vLLM encoder/decoder models ''' - outputs = self.generate_encoder_decoder_w_logprobs( + return self.generate_encoder_decoder_w_logprobs( encoder_decoder_prompts, greedy_logprobs_params) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] - def generate_beam_search( self, prompts: List[str], diff --git a/tests/models/utils.py b/tests/models/utils.py index 93ec03995094b..e3322f563b50f 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -1,7 +1,7 @@ import warnings from typing import Dict, List, Optional, Sequence, Tuple, Union -from vllm.sequence import Logprob, SampleLogprobs +from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs TokensText = Tuple[List[int], str] @@ -38,6 +38,10 @@ def check_outputs_equal( float]], SampleLogprobs]]] +TokensTextLogprobsPromptLogprobs = Tuple[ + List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]], + Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]] + # Allow for tokens to be represented as str's rather than IDs TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]], List[Dict[str, @@ -46,8 +50,12 @@ def check_outputs_equal( def check_logprobs_close( *, - outputs_0_lst: Sequence[Union[TokensTextLogprobs, TextTextLogprobs]], - outputs_1_lst: Sequence[Union[TokensTextLogprobs, TextTextLogprobs]], + outputs_0_lst: Sequence[Union[TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs, + TextTextLogprobs]], + outputs_1_lst: Sequence[Union[TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs, + TextTextLogprobs]], name_0: str, name_1: str, num_outputs_0_skip_tokens: int = 0, @@ -78,8 +86,60 @@ def check_logprobs_close( for prompt_idx, (outputs_0, outputs_1) in enumerate(zip(outputs_0_lst, outputs_1_lst)): - output_ids_0, output_str_0, logprobs_0 = outputs_0 - output_ids_1, output_str_1, logprobs_1 = outputs_1 + if len(outputs_0) == 3: + assert len(outputs_1) == 3 + # Break out tokens, text & sample logprobs + # (prompt logprobs were not provided) + output_ids_0, output_str_0, logprobs_0 = outputs_0 + output_ids_1, output_str_1, logprobs_1 = outputs_1 + elif len(outputs_0) == 4: + assert len(outputs_1) == 4 + # Break out tokens, text, sample logprobs & prompt logprobs + ( + output_ids_0, + output_str_0, + logprobs_0, + prompt_logprobs_0, + ) = outputs_0 + ( + output_ids_1, + output_str_1, + logprobs_1, + prompt_logprobs_1, + ) = outputs_1 + + # Test prompt logprobs closeness + if (prompt_logprobs_0 is not None + 
and prompt_logprobs_1 is not None): + # For each token's logprobs + for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate( + zip(prompt_logprobs_0, prompt_logprobs_1)): + fail_msg = ( + f"Prompt logprobs test:" + f"\n{name_0}:\tPrompt index {idx}\t{logprobs_elem_0}" + f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}") + + if logprobs_elem_0 is None: + # If either set of token logprobs is `None`, + # both must be + assert logprobs_elem_1 is None, fail_msg + else: + # Top-k token choices must be the same + assert logprobs_elem_1 is not None, fail_msg + assert (set(logprobs_elem_0.keys()) == set( + logprobs_elem_1.keys())), fail_msg + else: + # Both sequence logprobs lists must be `None` + fail_msg = (f"Prompt logprobs test:" + f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}" + f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}") + + assert (prompt_logprobs_0 is None + and prompt_logprobs_1 is None), fail_msg + else: + raise ValueError(f"Outputs tuple must have 3 or 4 elements but " + f"{len(outputs_0)} elements were provided: " + f"{outputs_0}") if logprobs_0 is None: logprobs_0 = [None] * len(output_ids_0) diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 6aa6ec0fbb64e..03bcd0843c3b5 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -75,3 +75,67 @@ def test_multi_step_llm( name_0="hf", name_1="vllm", ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("tp_size", [1]) +@pytest.mark.parametrize("max_tokens", [5]) +@pytest.mark.parametrize("enforce_eager", [True]) +@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) +@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) +@pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)]) +def test_multi_step_llm_w_prompt_logprobs( + vllm_runner, + example_prompts, + model: str, + dtype: str, + tp_size: int, + max_tokens: int, + enforce_eager: int, + num_scheduler_steps: int, + num_prompts: int, + num_logprobs: Optional[int], + num_prompt_logprobs: Optional[int], +) -> None: + + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts + + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + use_v2_block_manager=True, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) + + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + ) as vllm_model: + single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) + + check_logprobs_close( + outputs_0_lst=single_step_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) From 8865bbdec5530624eb8ba64c1b4994ead8d88e62 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 10:45:53 -0400 Subject: [PATCH 33/47] wip --- tests/models/utils.py | 8 +--- tests/multi_step/test_correctness_llm.py | 50 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/tests/models/utils.py b/tests/models/utils.py index e3322f563b50f..4d5ce1d7002f3 
100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -60,7 +60,6 @@ def check_logprobs_close( name_1: str, num_outputs_0_skip_tokens: int = 0, warn_on_mismatch: bool = True, - always_check_logprobs: bool = False, ) -> None: """Compare the logprobs of two sequences generated by different models, which should be similar but not necessarily equal. @@ -78,7 +77,6 @@ def check_logprobs_close( num_outputs_0_skip_tokens warn_on_mismatch: Issue a warning if there is token-wise or text-wise mismatch between the two sequences - always_check_logprobs: If true, check logprobs even when tokens match """ assert len(outputs_0_lst) == len(outputs_1_lst) @@ -159,12 +157,10 @@ def check_logprobs_close( for idx, (output_id_0, output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): - is_tok_mismatch = output_id_0 != output_id_1 - # If generated tokens don't match # or it is desired to always check logprobs, # then - if is_tok_mismatch or always_check_logprobs: + if output_id_0 != output_id_1: logprobs_elem_0 = logprobs_0[idx] logprobs_elem_1 = logprobs_1[idx] @@ -180,7 +176,7 @@ def check_logprobs_close( assert output_id_0 in logprobs_elem_1, fail_msg assert output_id_1 in logprobs_elem_0, fail_msg - if warn_on_mismatch and is_tok_mismatch: + if warn_on_mismatch: with warnings.catch_warnings(): # This ensures that repeated warnings are shown # in the output, not just the first occurrence diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 03bcd0843c3b5..e184a54e24778 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -34,6 +34,31 @@ def test_multi_step_llm( num_prompts: int, num_logprobs: Optional[int], ) -> None: + """Test vLLM engine with multi-step scheduling via sync LLM Engine. + + Set up a HuggingFace (HF) transformers model as a ground-truth reference. + + Prompt them with the same example prompts. + + Validate: + * Generated tokens match + * Generated logprobs are all very close + + Args: + hf_runner: HF transformers model runner fixture + vllm_runner: vLLM model runner fixture + example_prompts: test fixture providing example prompts + model: model under test (same for single- and multi-step engines) + dtype: tensor datatype for engine to utilize + tp_size: degree of tensor-parallelism + max_tokens: the maximum number of tokens to generate + enforce_eager + num_scheduler_steps: for multi-step scheduling, GPU-side steps per + GPU -> CPU output transfer + num_prompts: number of example prompts under test + num_logprobs: corresponds to the `logprobs` argument to the OpenAI + completions endpoint; `None` -> no logprobs + """ prompts = example_prompts if len(prompts) < num_prompts: @@ -98,6 +123,31 @@ def test_multi_step_llm_w_prompt_logprobs( num_logprobs: Optional[int], num_prompt_logprobs: Optional[int], ) -> None: + """Test vLLM engine with multi-step scheduling via sync LLM Engine. + + Set up a vLLM engine instance w/ single-step scheduling as a ground-truth + reference. + + Prompt them with the same example prompts. 
+ + Validate: + * Generated logprobs are all very close + + Args: + hf_runner: HF transformers model runner fixture + vllm_runner: vLLM model runner fixture + example_prompts: test fixture providing example prompts + model: model under test (same for single- and multi-step engines) + dtype: tensor datatype for engine to utilize + tp_size: degree of tensor-parallelism + max_tokens: the maximum number of tokens to generate + enforce_eager + num_scheduler_steps: for multi-step scheduling, GPU-side steps per + GPU -> CPU output transfer + num_prompts: number of example prompts under test + num_logprobs: corresponds to the `logprobs` argument to the OpenAI + completions endpoint; `None` -> no logprobs + """ prompts = example_prompts if len(prompts) < num_prompts: From e05670b91f851197fd8d451aa3e5133278c8f466 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 12:44:02 -0400 Subject: [PATCH 34/47] increased max wait time --- tests/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 850648b3e597f..5218def4a24f0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -456,7 +456,8 @@ async def completions_with_server_args( ''' outputs = None - with RemoteOpenAIServer(model_name, server_cli_args) as server: + max_wait_seconds = 240*3 # 240 is default + with RemoteOpenAIServer(model_name, server_cli_args, max_wait_seconds=max_wait_seconds) as server: client = server.get_async_client() outputs = await client.completions.create(model=model_name, prompt=prompts, From 42af633dbb336dd8a2f85da69651bf58e3e057ad Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 27 Aug 2024 12:52:15 -0400 Subject: [PATCH 35/47] formatting --- tests/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 5218def4a24f0..f8882e7bff69c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -456,8 +456,10 @@ async def completions_with_server_args( ''' outputs = None - max_wait_seconds = 240*3 # 240 is default - with RemoteOpenAIServer(model_name, server_cli_args, max_wait_seconds=max_wait_seconds) as server: + max_wait_seconds = 240 * 3 # 240 is default + with RemoteOpenAIServer(model_name, + server_cli_args, + max_wait_seconds=max_wait_seconds) as server: client = server.get_async_client() outputs = await client.completions.create(model=model_name, prompt=prompts, From ad7f2615b46012db7f3264f5ed1b1f3af15ac803 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 13 Sep 2024 08:42:45 -0400 Subject: [PATCH 36/47] seems to be passing tests --- vllm/worker/multi_step_model_runner.py | 51 ++++++++++++++++---------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index b900eb5a610ff..ad187cd37625d 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -614,34 +614,45 @@ def _pythonize_sampler_output( frozen_model_input = model_input.frozen_model_input assert frozen_model_input.sampling_metadata is not None + sampling_metadata = frozen_model_input.sampling_metadata # samples generation should have been skipped assert not output.outputs pinned_buffer = pinned_sampled_token_buffer[:model_input.num_queries] - # CPU GPU sync - pinned_buffer = pinned_buffer.copy_(sampled_token_ids, non_blocking=False) - - # this will not block as the tensors are already on CPU - samples_list = pinned_buffer.tolist() - - sampling_metadata = frozen_model_input.sampling_metadata 
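# Toy sketch (assumed shapes and values, not part of this change): when
# prompt logprobs are requested, sampled_token_ids also carries rows for
# prompt tokens, so only the rows at each group's sample_indices are
# gathered, as done a few lines below in this hunk.
import torch
toy_sampled_token_ids = torch.tensor([[11], [12], [99],   # group 0: 2 prompt rows, 1 sample row
                                      [21], [77]])        # group 1: 1 prompt row, 1 sample row
toy_sample_indices = [2, 4]
gathered = toy_sampled_token_ids[torch.tensor(toy_sample_indices), :]
assert gathered.tolist() == [[99], [77]]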
- - skip_sampler_cpu_output = ( - frozen_model_input.sampling_metadata.skip_sampler_cpu_output) - # We are guaranteed output tensors are ready, so it is safe to # pythonize the sampler output & obtain CPU-side logprobs. # # However this computation may be skipped entirely # if no pythonization was deferred. seq_groups = sampling_metadata.seq_groups - logprobs_are_requested = any([ - sg.sampling_params.logprobs is not None - or sg.sampling_params.prompt_logprobs is not None for sg in seq_groups + prompt_logprobs_are_requested_for_prefill = any([ + ((sg.sampling_params.prompt_logprobs is not None) and sg.is_prompt) + for sg in seq_groups ]) + any_logprobs_are_requested = any( + [sg.sampling_params.logprobs is not None + for sg in seq_groups]) or prompt_logprobs_are_requested_for_prefill + + if prompt_logprobs_are_requested_for_prefill: + # CPU GPU sync + sample_idx_tensor = torch.tensor( + [sdx for sg in seq_groups for sdx in sg.sample_indices]) + pinned_buffer = pinned_buffer.copy_( + sampled_token_ids[sample_idx_tensor, :], non_blocking=False) + else: + # CPU GPU sync + pinned_buffer = pinned_buffer.copy_(sampled_token_ids, + non_blocking=False) + + # this will not block as the tensors are already on CPU + samples_list = pinned_buffer.tolist() + + skip_sampler_cpu_output = ( + frozen_model_input.sampling_metadata.skip_sampler_cpu_output) + do_pythonize_logprobs = (skip_sampler_cpu_output - and logprobs_are_requested) + and any_logprobs_are_requested) ( prompt_logprobs, sample_logprobs, @@ -666,7 +677,7 @@ def _pythonize_sampler_output( prompt_logprobs[sgdx], sample_logprobs[sgdx], ) - elif logprobs_are_requested: + elif any_logprobs_are_requested: ( group_prompt_logprobs, group_sample_logprobs, @@ -696,7 +707,7 @@ def _pythonize_sampler_output( seq_output.parent_seq_id = seq_ids[parent_id] seq_output.output_token = next_token_id - if logprobs_are_requested: + if any_logprobs_are_requested: seq_output.logprobs = group_sample_logprobs[tdx] else: logprobs = next(iter(seq_output.logprobs.values())) @@ -714,7 +725,7 @@ def _pythonize_sampler_output( seq_outputs.append( SequenceOutput(seq_ids[parent_id], next_token_id, (group_sample_logprobs[tdx] - if logprobs_are_requested else { + if any_logprobs_are_requested else { next_token_id: Logprob(logprob=float('inf'), rank=None, @@ -722,12 +733,12 @@ def _pythonize_sampler_output( }))) if cache is not None: completion_seq_group_output.prompt_logprobs = \ - group_prompt_logprobs if logprobs_are_requested else None + group_prompt_logprobs if any_logprobs_are_requested else None output.outputs.append(completion_seq_group_output) else: output.outputs.append( CompletionSequenceGroupOutput( seq_outputs, (group_prompt_logprobs - if logprobs_are_requested else None))) + if any_logprobs_are_requested else None))) assert len(output.outputs) > 0 From ac4b36f6a0c5c81f3de31931095d9372052ab35c Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 13 Sep 2024 08:56:40 -0400 Subject: [PATCH 37/47] comments --- vllm/worker/multi_step_model_runner.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index ad187cd37625d..563d45748bc49 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -623,8 +623,21 @@ def _pythonize_sampler_output( # We are guaranteed output tensors are ready, so it is safe to # pythonize the sampler output & obtain CPU-side logprobs. 
# - # However this computation may be skipped entirely - # if no pythonization was deferred. + # However we should check whether logprobs pythonization may + # be skipped entirely, i.e. because no logprobs were requested + # or pythonization was not deferred. To that end, + # + # * `prompt_logprobs_are_requested_for_prefill` signals that + # there are *any* prefill-phase `SequenceGroup`s for which + # prompt logprobs were requested + # + # * `any_logprobs_are_requested` signals that there are any + # `SequenceGroup`s requesting (1) sample logprobs or (2) + # prompt logprobs in prefill phase. + # + # Later on, these flags cause adjustments to the pythonization + # process to accommodate logprobs. + seq_groups = sampling_metadata.seq_groups prompt_logprobs_are_requested_for_prefill = any([ ((sg.sampling_params.prompt_logprobs is not None) and sg.is_prompt) @@ -635,7 +648,9 @@ def _pythonize_sampler_output( for sg in seq_groups]) or prompt_logprobs_are_requested_for_prefill if prompt_logprobs_are_requested_for_prefill: - # CPU GPU sync + # CPU GPU sync, after gathering *only* sampled tokens (since + # requesting prompt logprobs leads `sampled_token_ids` to + # include prompt token ids in addition to sampled token ids.) sample_idx_tensor = torch.tensor( [sdx for sg in seq_groups for sdx in sg.sample_indices]) pinned_buffer = pinned_buffer.copy_( @@ -651,6 +666,11 @@ def _pythonize_sampler_output( skip_sampler_cpu_output = ( frozen_model_input.sampling_metadata.skip_sampler_cpu_output) + # *Don't* skip logprobs pythonization *if*: + # * Any requests require logprobs to be returned in this + # iteration AND + # * These requests are being scheduled in a fashion which + # defers pythonization (i.e. multi-step scheduling.) do_pythonize_logprobs = (skip_sampler_cpu_output and any_logprobs_are_requested) ( From dcad218a019b2899c9bdbf352e2a3da7681cb227 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 13 Sep 2024 09:13:31 -0400 Subject: [PATCH 38/47] refactoring --- tests/models/utils.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/models/utils.py b/tests/models/utils.py index e3322f563b50f..43f949a3f7004 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -34,19 +34,38 @@ def check_outputs_equal( assert output_ids_0 == output_ids_1, fail_msg +# Representation of generated sequence as a tuple of +# * Token ID list +# * String +# * List of top sample logprobs for each sampled token +# +# Assumes prompt logprobs were not requested. TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]]] -TokensTextLogprobsPromptLogprobs = Tuple[ - List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]], - Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]] - -# Allow for tokens to be represented as str's rather than IDs +# Allow for tokens to be represented as str's rather than IDs; +# tuple of +# * Token string representations list +# * String +# * Optional list of top sample logprobs for each sampled token +# +# Assumes prompt logprobs were not requested. TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]], List[Dict[str, Logprob]]]]] +# Representation of generated sequence as a tuple of +# * Token ID list +# * String +# * Optional list of top sample logprobs for each sampled token +# * Optional list of top prompt logprobs for each prompt token +# +# Allows prompt logprobs to be requested. 
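# Illustration (hypothetical values) of the 4-tuple shape described above:
# sampled token ids, generated text, top sample logprobs per sampled token,
# and top prompt logprobs per prompt token (the first prompt position is None).
example_tokens_text_logprobs_prompt_logprobs = (
    [791, 3938],                                  # sampled token ids
    "The future",                                 # generated text
    [{791: -0.2, 1115: -1.9}, {3938: -0.4}],      # sample logprobs
    [None, {315: -0.7}, {279: -0.3}],             # prompt logprobs
)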
+TokensTextLogprobsPromptLogprobs = Tuple[ + List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]], + Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]] + def check_logprobs_close( *, From 0f373ab39307e52b827d12f116cfd7ae34f328a3 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 13 Sep 2024 09:20:43 -0400 Subject: [PATCH 39/47] comment --- vllm/worker/multi_step_model_runner.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 563d45748bc49..9c80c57ba6b4d 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -628,12 +628,13 @@ def _pythonize_sampler_output( # or pythonization was not deferred. To that end, # # * `prompt_logprobs_are_requested_for_prefill` signals that - # there are *any* prefill-phase `SequenceGroup`s for which - # prompt logprobs were requested + # there are *any* prefill-phase requests which specify that + # prompt logprobs should be returned. # # * `any_logprobs_are_requested` signals that there are any - # `SequenceGroup`s requesting (1) sample logprobs or (2) - # prompt logprobs in prefill phase. + # requests which (1) specify that sample logprobs should be + # returned, or (2) are in the prefill phase AND specify that + # prompt logprobs should be returned. # # Later on, these flags cause adjustments to the pythonization # process to accommodate logprobs. From 9bff9b661e01c14fe2d66b88880df2114003e087 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 13 Sep 2024 09:28:12 -0400 Subject: [PATCH 40/47] updated prompt logprobs test comment --- tests/multi_step/test_correctness_llm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index e1ea5c82774cb..c5dc81cc25622 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -123,7 +123,7 @@ def test_multi_step_llm_w_prompt_logprobs( num_logprobs: Optional[int], num_prompt_logprobs: Optional[int], ) -> None: - """Test vLLM engine with multi-step scheduling via sync LLM Engine. + """Test prompt logprobs with multi-step scheduling via sync LLM Engine. Set up a vLLM engine instance w/ single-step scheduling as a ground-truth reference. @@ -131,7 +131,7 @@ def test_multi_step_llm_w_prompt_logprobs( Prompt them with the same example prompts. Validate: - * Generated logprobs are all very close + * All generated logprobs are all very close Args: hf_runner: HF transformers model runner fixture @@ -147,6 +147,9 @@ def test_multi_step_llm_w_prompt_logprobs( num_prompts: number of example prompts under test num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> no logprobs + num_prompt_logprobs: number of logprobs to return for each prompt token; + note that this argument is not supported by the + OpenAI completions endpoint. 
""" prompts = example_prompts From 1a28003d69b0bc5c664f4af672eba0229897b4db Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 13 Sep 2024 09:40:15 -0400 Subject: [PATCH 41/47] updated check_logprobs_close() comment --- tests/models/utils.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/models/utils.py b/tests/models/utils.py index 43f949a3f7004..c7a80963e284d 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -84,6 +84,18 @@ def check_logprobs_close( """Compare the logprobs of two sequences generated by different models, which should be similar but not necessarily equal. + How sample logprobs are compared: + * `always_check_logprobs == True`: set of highest-logprob token ids + must match between seq0 and seq1 at all sampled token offsets + * `not always_check_logprobs == True`: highest-logprob token ids are + only compared at sampled token offsets for which generated token + ids don't match + + Prompt logprobs must be provided either for both input sequences, or + for neither. If prompt logprobs are provided, then highest-logprob + prompt token ids must match between seq0 and seq1 at all prompt token + offsets. + Args: outputs_0_lst: First sequence to compare outputs_0_lst: Second sequence to compare @@ -130,7 +142,9 @@ def check_logprobs_close( # Test prompt logprobs closeness if (prompt_logprobs_0 is not None and prompt_logprobs_1 is not None): - # For each token's logprobs + # Both sequences' prompt logprobs lists are not `None`` + # (although individual list elements may be `None`); + # for each token's logprobs: for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate( zip(prompt_logprobs_0, prompt_logprobs_1)): fail_msg = ( @@ -139,12 +153,14 @@ def check_logprobs_close( f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}") if logprobs_elem_0 is None: - # If either set of token logprobs is `None`, - # both must be + # If the seq 0 token's logprobs are `None`, + # the seq 1 token's logprobs must be `None` assert logprobs_elem_1 is None, fail_msg else: - # Top-k token choices must be the same + # If the seq 0 token's logprobs are not `None`, + # the seq 1 token's logprobs must not be `None` assert logprobs_elem_1 is not None, fail_msg + # Logprobs check: top-k token choices must be the same assert (set(logprobs_elem_0.keys()) == set( logprobs_elem_1.keys())), fail_msg else: From c9d95377d4162a5de5c39c812ec2afab39ef01de Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 13 Sep 2024 09:42:05 -0400 Subject: [PATCH 42/47] small fix --- tests/models/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/utils.py b/tests/models/utils.py index c7a80963e284d..606f695919535 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -87,7 +87,7 @@ def check_logprobs_close( How sample logprobs are compared: * `always_check_logprobs == True`: set of highest-logprob token ids must match between seq0 and seq1 at all sampled token offsets - * `not always_check_logprobs == True`: highest-logprob token ids are + * `always_check_logprobs == False`: highest-logprob token ids are only compared at sampled token offsets for which generated token ids don't match From 5d64bf344bd4930c95dc8b1e6f0ea0a9731de14a Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Tue, 17 Sep 2024 11:44:26 -0400 Subject: [PATCH 43/47] Update tests/models/utils.py Co-authored-by: Cody Yu --- tests/models/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) 
diff --git a/tests/models/utils.py b/tests/models/utils.py index 606f695919535..8cba23edf6b51 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -117,14 +117,13 @@ def check_logprobs_close( for prompt_idx, (outputs_0, outputs_1) in enumerate(zip(outputs_0_lst, outputs_1_lst)): + assert len(output_0) == len(output_1) if len(outputs_0) == 3: - assert len(outputs_1) == 3 # Break out tokens, text & sample logprobs # (prompt logprobs were not provided) output_ids_0, output_str_0, logprobs_0 = outputs_0 output_ids_1, output_str_1, logprobs_1 = outputs_1 elif len(outputs_0) == 4: - assert len(outputs_1) == 4 # Break out tokens, text, sample logprobs & prompt logprobs ( output_ids_0, From c1524461669c1a41268fba22901945c19b672cec Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Tue, 17 Sep 2024 11:44:43 -0400 Subject: [PATCH 44/47] Update vllm/worker/multi_step_model_runner.py Co-authored-by: Cody Yu --- vllm/worker/multi_step_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 9c80c57ba6b4d..7a4c23e6f890a 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -620,7 +620,7 @@ def _pythonize_sampler_output( pinned_buffer = pinned_sampled_token_buffer[:model_input.num_queries] - # We are guaranteed output tensors are ready, so it is safe to + # We guarantee output tensors are ready, so it is safe to # pythonize the sampler output & obtain CPU-side logprobs. # # However we should check whether logprobs pythonization may From 81972195e7d2383a7fc88b8a9065c39f5a0a6d00 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Tue, 17 Sep 2024 11:45:46 -0400 Subject: [PATCH 45/47] Update vllm/worker/multi_step_model_runner.py Co-authored-by: Cody Yu --- vllm/worker/multi_step_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 7a4c23e6f890a..5e1ee07af0fa8 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -644,9 +644,9 @@ def _pythonize_sampler_output( ((sg.sampling_params.prompt_logprobs is not None) and sg.is_prompt) for sg in seq_groups ]) - any_logprobs_are_requested = any( + any_logprobs_are_requested = prompt_logprobs_are_requested_for_prefill or any( [sg.sampling_params.logprobs is not None - for sg in seq_groups]) or prompt_logprobs_are_requested_for_prefill + for sg in seq_groups]) if prompt_logprobs_are_requested_for_prefill: # CPU GPU sync, after gathering *only* sampled tokens (since From 908709cbbe62f9e8fdc8c43e68160b7b2169f5b3 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:04:11 -0400 Subject: [PATCH 46/47] Update vllm/worker/multi_step_model_runner.py Co-authored-by: Cody Yu --- vllm/worker/multi_step_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 5e1ee07af0fa8..339b76f80e7dd 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -641,7 +641,7 @@ def _pythonize_sampler_output( seq_groups = sampling_metadata.seq_groups prompt_logprobs_are_requested_for_prefill = any([ - ((sg.sampling_params.prompt_logprobs is not None) and 
sg.is_prompt) + sg.sampling_params.prompt_logprobs is not None and sg.is_prompt for sg in seq_groups ]) any_logprobs_are_requested = prompt_logprobs_are_requested_for_prefill or any( From cce5394678c1e4d06d524083361fe69a6c137195 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 17 Sep 2024 14:28:25 -0400 Subject: [PATCH 47/47] addressing feedback --- tests/models/utils.py | 4 +++- tests/utils.py | 2 +- vllm/worker/multi_step_model_runner.py | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/models/utils.py b/tests/models/utils.py index 8cba23edf6b51..8e31a1d6eefed 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -117,13 +117,15 @@ def check_logprobs_close( for prompt_idx, (outputs_0, outputs_1) in enumerate(zip(outputs_0_lst, outputs_1_lst)): - assert len(output_0) == len(output_1) + assert len(outputs_0) == len(outputs_1) if len(outputs_0) == 3: + assert len(outputs_1) == 3 # Break out tokens, text & sample logprobs # (prompt logprobs were not provided) output_ids_0, output_str_0, logprobs_0 = outputs_0 output_ids_1, output_str_1, logprobs_1 = outputs_1 elif len(outputs_0) == 4: + assert len(outputs_1) == 4 # Break out tokens, text, sample logprobs & prompt logprobs ( output_ids_0, diff --git a/tests/utils.py b/tests/utils.py index af46e09f93429..766d70c1ae9e9 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -486,7 +486,7 @@ async def completions_with_server_args( stream=False, max_tokens=5, logprobs=num_logprobs) - assert outputs is not None + assert outputs is not None, "Completion API call failed." return outputs diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 339b76f80e7dd..ebcafbbab119a 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -644,9 +644,9 @@ def _pythonize_sampler_output( sg.sampling_params.prompt_logprobs is not None and sg.is_prompt for sg in seq_groups ]) - any_logprobs_are_requested = prompt_logprobs_are_requested_for_prefill or any( - [sg.sampling_params.logprobs is not None - for sg in seq_groups]) + any_logprobs_are_requested = ( + prompt_logprobs_are_requested_for_prefill + or any([sg.sampling_params.logprobs is not None for sg in seq_groups])) if prompt_logprobs_are_requested_for_prefill: # CPU GPU sync, after gathering *only* sampled tokens (since