From df5dafaa5ba611f7179720958ba63e49615c927f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 25 Jan 2025 03:45:20 +0800 Subject: [PATCH] [Misc] Remove deprecated code (#12383) Signed-off-by: DarkLight1337 --- tests/async_engine/test_api_server.py | 23 ++++++---- tests/basic_correctness/test_preemption.py | 18 ++++---- .../multi_step/test_correctness_async_llm.py | 3 +- vllm/config.py | 10 ----- vllm/engine/arg_utils.py | 6 --- vllm/engine/metrics.py | 43 ------------------- 6 files changed, 25 insertions(+), 78 deletions(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 83c71b5cf6eb7..91ac35dd67bbf 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -25,27 +25,32 @@ def _query_server_long(prompt: str) -> dict: @pytest.fixture -def api_server(tokenizer_pool_size: int, worker_use_ray: bool): +def api_server(tokenizer_pool_size: int, distributed_executor_backend: str): script_path = Path(__file__).parent.joinpath( "api_server_async_engine.py").absolute() commands = [ - sys.executable, "-u", - str(script_path), "--model", "facebook/opt-125m", "--host", - "127.0.0.1", "--tokenizer-pool-size", - str(tokenizer_pool_size) + sys.executable, + "-u", + str(script_path), + "--model", + "facebook/opt-125m", + "--host", + "127.0.0.1", + "--tokenizer-pool-size", + str(tokenizer_pool_size), + "--distributed-executor-backend", + distributed_executor_backend, ] - if worker_use_ray: - commands.append("--worker-use-ray") uvicorn_process = subprocess.Popen(commands) yield uvicorn_process.terminate() @pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) -@pytest.mark.parametrize("worker_use_ray", [False, True]) +@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) def test_api_server(api_server, tokenizer_pool_size: int, - worker_use_ray: bool): + distributed_executor_backend: str): """ Run the API server and test it. diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 4e502cfb5f4f8..4b27dcbc8609f 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -29,10 +29,10 @@ def check_settings(): @pytest.fixture -def worker_use_ray() -> bool: - # When SPMD worker is used, use ray_use_worker=True +def distributed_executor_backend() -> str: + # When SPMD worker is used, use distributed_executor_backend="ray" # to test delta input optimization works with preemption. 
- return envs.VLLM_USE_RAY_SPMD_WORKER + return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" @pytest.mark.parametrize("model", MODELS) @@ -47,7 +47,7 @@ def test_chunked_prefill_recompute( dtype: str, max_tokens: int, chunked_prefill_token_size: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """Ensure that chunked prefill works with preemption.""" max_num_seqs = min(chunked_prefill_token_size, 256) @@ -66,7 +66,7 @@ def test_chunked_prefill_recompute( max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, max_num_seqs=max_num_seqs, - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, disable_log_stats=False, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) @@ -93,7 +93,7 @@ def test_preemption( model: str, dtype: str, max_tokens: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """By default, recompute preemption is enabled""" @@ -104,7 +104,7 @@ def test_preemption( model, dtype=dtype, disable_log_stats=False, - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt @@ -144,7 +144,7 @@ def test_preemption_infeasible( model: str, dtype: str, max_tokens: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """Verify infeasible preemption request will be ignored.""" BLOCK_SIZE = 16 @@ -159,7 +159,7 @@ def test_preemption_infeasible( # ignored instead of hanging forever. num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 8456a463adeeb..b8524ed83026b 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -16,7 +16,8 @@ NUM_PROMPTS = [10] DEFAULT_SERVER_ARGS: List[str] = [ - "--worker-use-ray", + "--distributed-executor-backend", + "ray", "--gpu-memory-utilization", "0.85", "--swap-space", diff --git a/vllm/config.py b/vllm/config.py index efd81ad3de3b4..11c6f853b2b45 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1227,9 +1227,6 @@ class ParallelConfig: pipeline_parallel_size: int = 1 # Number of pipeline parallel groups. tensor_parallel_size: int = 1 # Number of tensor parallel groups. - # Deprecated, use distributed_executor_backend instead. - worker_use_ray: Optional[bool] = None - # Maximum number of multiple batches # when load model sequentially. To avoid RAM OOM when using tensor # parallel and large models. 
@@ -1283,13 +1280,6 @@ def __post_init__(self) -> None: self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size - if self.worker_use_ray: - if self.distributed_executor_backend is None: - self.distributed_executor_backend = "ray" - elif not self.use_ray: - raise ValueError(f"worker-use-ray can't be used with " - f"distributed executor backend " - f"'{self.distributed_executor_backend}'.") ray_only_devices = ["tpu"] from vllm.platforms import current_platform if (current_platform.device_type in ray_only_devices diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8f1b0bc5fd62e..f16e8e6df76bd 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -100,7 +100,6 @@ class EngineArgs: kv_cache_dtype: str = 'auto' seed: int = 0 max_model_len: Optional[int] = None - worker_use_ray: bool = False # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without # notice. @@ -389,10 +388,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'to "ray" if Ray is installed and fail otherwise. Note that tpu ' 'only supports Ray for distributed inference.') - parser.add_argument( - '--worker-use-ray', - action='store_true', - help='Deprecated, use ``--distributed-executor-backend=ray``.') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, @@ -1071,7 +1066,6 @@ def create_engine_config(self, parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, - worker_use_ray=self.worker_use_ray, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, tokenizer_pool_config=TokenizerPoolConfig.create_config( diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index f7ce21d0ae988..b771c190dd82a 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -259,21 +259,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): documentation="Number of emitted tokens.", labelnames=labelnames)) - # Deprecated in favor of vllm:prompt_tokens_total - self.gauge_avg_prompt_throughput = self._gauge_cls( - name="vllm:avg_prompt_throughput_toks_per_s", - documentation="Average prefill throughput in tokens/s.", - labelnames=labelnames, - multiprocess_mode="sum", - ) - # Deprecated in favor of vllm:generation_tokens_total - self.gauge_avg_generation_throughput = self._gauge_cls( - name="vllm:avg_generation_throughput_toks_per_s", - documentation="Average generation throughput in tokens/s.", - labelnames=labelnames, - multiprocess_mode="sum", - ) - # end-metrics-definitions @@ -635,20 +620,6 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_histogram(self.metrics.histogram_max_tokens_request, stats.max_tokens_requests) - def _log_prometheus_interval(self, prompt_throughput: float, - generation_throughput: float) -> None: - # Logs metrics to prometheus that are computed every logging_interval. - # Support legacy gauge metrics that make throughput calculations on - # the vLLM side. Moving forward, we should use counters like - # counter_prompt_tokens, counter_generation_tokens - # Which log raw data and calculate summaries using rate() on the - # grafana/prometheus side. 
See - # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 - self.metrics.gauge_avg_prompt_throughput.labels( - **self.labels).set(prompt_throughput) - self.metrics.gauge_avg_generation_throughput.labels( - **self.labels).set(generation_throughput) - def log(self, stats: Stats): """Logs to prometheus and tracked stats every iteration.""" # Log to prometheus. @@ -664,20 +635,6 @@ def log(self, stats: Stats): # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): - # Compute summary metrics for tracked stats (and log them - # to promethus if applicable). - prompt_throughput = get_throughput(self.num_prompt_tokens, - now=stats.now, - last_log=self.last_local_log) - generation_throughput = get_throughput( - self.num_generation_tokens, - now=stats.now, - last_log=self.last_local_log) - - self._log_prometheus_interval( - prompt_throughput=prompt_throughput, - generation_throughput=generation_throughput) - if self.spec_decode_metrics is not None: self._log_gauge( self.metrics.gauge_spec_decode_draft_acceptance_rate,
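
A minimal migration sketch for callers of the removed flag, assuming the offline LLM entrypoint forwards these engine keyword arguments unchanged (the model name is taken from the test above; the parallel size is illustrative):

    from vllm import LLM

    # Before this patch: LLM(model=..., worker_use_ray=True)
    # After this patch: select the executor backend by name ("ray" or "mp").
    llm = LLM(
        model="facebook/opt-125m",           # model used by the API-server test
        tensor_parallel_size=2,              # illustrative multi-GPU setting
        distributed_executor_backend="ray",  # "mp" selects local multiprocessing
    )

The online server takes the equivalent flag, as the updated multi-step test shows: --distributed-executor-backend ray.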
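
With the avg-throughput gauges removed, throughput is derived from the surviving counters, e.g. rate(vllm:prompt_tokens_total[1m]) on the Prometheus/Grafana side. A rough client-side equivalent, sketched under the assumption that the server exposes text-format metrics at /metrics (the endpoint URL and sampling window below are illustrative, and the exposition line format may vary by client-library version):

    import time
    import urllib.request

    METRICS_URL = "http://localhost:8000/metrics"  # illustrative endpoint

    def read_prompt_tokens_total() -> float:
        # Sum every vllm:prompt_tokens_total sample in the exposition text.
        text = urllib.request.urlopen(METRICS_URL).read().decode()
        return sum(
            float(line.rsplit(" ", 1)[1])
            for line in text.splitlines()
            if line.startswith("vllm:prompt_tokens_total"))

    # Two samples a few seconds apart give an average prompt throughput,
    # mirroring what rate() would report on the Prometheus side.
    t0, c0 = time.time(), read_prompt_tokens_total()
    time.sleep(5)
    t1, c1 = time.time(), read_prompt_tokens_total()
    print(f"prompt throughput: {(c1 - c0) / (t1 - t0):.1f} tok/s")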