From 596d58cff148fb9f33c32ce0c6ab56879b3d9f84 Mon Sep 17 00:00:00 2001
From: Matt Wong <156021403+mawong-amd@users.noreply.github.com>
Date: Fri, 28 Jun 2024 02:08:40 -0500
Subject: [PATCH] Revert "Merge branch 'main' of github.com:ROCm/vllm" (#72)

This reverts commit 2a3cbf9d8adea6d4dca488dc6c502590ea81fad1, reversing
changes made to 367aa5a8a25e94a388f5c188014f33274ce7e4b6.
---
 ...test_correctness.py_alt => test_correctness.py} |  0
 ...w.py_alt => test_correctness_sliding_window.py} |  0
 tests/metrics/test_metrics.py                      | 14 +++++---------
 tests/worker/test_model_runner.py                  |  4 ++--
 tests/worker/test_swap.py                          |  3 +--
 5 files changed, 8 insertions(+), 13 deletions(-)
 rename tests/core/block/e2e/{test_correctness.py_alt => test_correctness.py} (100%)
 rename tests/core/block/e2e/{test_correctness_sliding_window.py_alt => test_correctness_sliding_window.py} (100%)

diff --git a/tests/core/block/e2e/test_correctness.py_alt b/tests/core/block/e2e/test_correctness.py
similarity index 100%
rename from tests/core/block/e2e/test_correctness.py_alt
rename to tests/core/block/e2e/test_correctness.py
diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py_alt b/tests/core/block/e2e/test_correctness_sliding_window.py
similarity index 100%
rename from tests/core/block/e2e/test_correctness_sliding_window.py_alt
rename to tests/core/block/e2e/test_correctness_sliding_window.py
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index 1112b5cf9148a..e0aa14f165c2d 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -26,7 +26,6 @@ def test_metric_counter_prompt_tokens(
     vllm_model = vllm_runner(model,
                              dtype=dtype,
                              disable_log_stats=False,
-                             enforce_eager=True,
                              gpu_memory_utilization=0.4)
     tokenizer = vllm_model.model.get_tokenizer()
     prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
@@ -60,7 +59,6 @@ def test_metric_counter_generation_tokens(
     vllm_model = vllm_runner(model,
                              dtype=dtype,
                              disable_log_stats=False,
-                             enforce_eager=True,
                              gpu_memory_utilization=0.4)
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     tokenizer = vllm_model.model.get_tokenizer()
@@ -90,7 +88,6 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
     vllm_model = vllm_runner(model,
                              dtype=dtype,
                              disable_log_stats=False,
-                             enforce_eager=True,
                              gpu_memory_utilization=0.3,
                              served_model_name=served_model_name)
     stat_logger = vllm_model.model.llm_engine.stat_logger
@@ -108,7 +105,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
         f"{served_model_name[0]!r}\n"
         f"actual: {metrics_tag_content!r}")
 
-"""
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [4])
@@ -121,11 +118,11 @@ async def test_async_engine_log_metrics_regression(
     max_tokens: int,
     disable_log_stats: bool,
 ) -> None:
-    " ""
+    """
     Regression test ensuring async engine generates metrics
     when disable_log_stats=False
     (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
-    " ""
+    """
     engine_args = AsyncEngineArgs(model=model,
                                   dtype=dtype,
                                   disable_log_stats=disable_log_stats)
@@ -144,7 +141,6 @@ async def test_async_engine_log_metrics_regression(
                    len(example_prompts))
 
 
-
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [4])
@@ -162,7 +158,7 @@ def test_engine_log_metrics_regression(
     engine = LLMEngine.from_engine_args(engine_args)
     for i, prompt in enumerate(example_prompts):
         engine.add_request(
-            f\"request-id-{i}\",
+            f"request-id-{i}",
             prompt,
             SamplingParams(max_tokens=max_tokens),
         )
@@ -170,7 +166,7 @@ def test_engine_log_metrics_regression(
         engine.step()
 
     assert_metrics(engine, disable_log_stats, len(example_prompts))
 
-"""
+
 def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
                    num_requests: int) -> None:
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index 734da4193a567..92de545acd53d 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -137,7 +137,7 @@ def test_prepare_prompt(batch_size):
                             dtype=actual.dtype)
     torch.testing.assert_close(actual, expected)
 
-"""
+
 @pytest.mark.parametrize("batch_size", list(range(1, 257)))
 def test_prepare_decode_cuda_graph(batch_size):
     model_runner = _create_model_runner(
@@ -246,7 +246,7 @@ def test_prepare_decode_cuda_graph(batch_size):
                             device=actual.device,
                             dtype=actual.dtype)
     torch.testing.assert_close(actual, expected)
-"""
+
 
 def test_empty_seq_group():
     """Verify prepare prompt and decode returns empty output."""
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index 724257e0a884f..d941ffdb5588a 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -5,7 +5,7 @@
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.worker.worker import Worker
 
-"""
+
 def test_swap() -> None:
     # Configure the engine.
     engine_args = EngineArgs(model="facebook/opt-125m",
@@ -87,4 +87,3 @@ def test_swap() -> None:
     for src, dst in execute_model_req.blocks_to_swap_in:
         assert allclose(gpu_key_cache[dst], cpu_key_cache[src])
         assert allclose(gpu_value_cache[dst], cpu_value_cache[src])
-"""