
Commit

[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (vllm-project#5357)
youkaichao authored and joerunde committed Jun 13, 2024
1 parent 56239b9 commit c7ec7c8
Showing 28 changed files with 455 additions and 494 deletions.
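
The change is the same across all of these tests: instead of creating the runner, using it, and relying on "del vllm_model" (and the runner's __del__) to release GPU memory, each test now opens the runner in a with block so cleanup happens in __exit__. A del only drops a reference; finalization timing is left to the interpreter, and the statement is skipped entirely if an assertion fails earlier in the test, which can leak GPU memory into the next test. A minimal sketch of the idea, using illustrative names rather than vLLM's actual classes:

# Minimal sketch of the pattern this commit adopts (illustrative names only,
# not vLLM's real runner): cleanup moves from `del`/__del__ into __exit__,
# so it runs even when the test body raises.
import gc


class DummyRunner:

    def __init__(self, model: str):
        self.model = f"loaded({model})"  # stand-in for allocating GPU state

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Runs on normal exit and on exceptions alike.
        del self.model
        gc.collect()  # stand-in for the conftest cleanup() helper


# Old style: cleanup only happens if execution reaches the `del`.
runner = DummyRunner("facebook/opt-125m")
assert runner.model is not None  # a failure here would leak the model
del runner

# New style: the `with` block guarantees __exit__ runs.
with DummyRunner("facebook/opt-125m") as runner:
    assert runner.model is not None
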
11 changes: 5 additions & 6 deletions tests/basic_correctness/test_basic_correctness.py
@@ -46,12 +46,11 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(model,
-                             dtype=dtype,
-                             enforce_eager=enforce_eager,
-                             gpu_memory_utilization=0.7)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(model,
+                     dtype=dtype,
+                     enforce_eager=enforce_eager,
+                     gpu_memory_utilization=0.7) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
21 changes: 10 additions & 11 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -43,17 +43,16 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        max_num_batched_tokens=max_num_batched_tokens,
-        enable_chunked_prefill=enable_chunked_prefill,
-        tensor_parallel_size=tensor_parallel_size,
-        enforce_eager=enforce_eager,
-        max_num_seqs=max_num_seqs,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=enable_chunked_prefill,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
148 changes: 73 additions & 75 deletions tests/basic_correctness/test_preemption.py
@@ -46,17 +46,16 @@ def test_chunked_prefill_recompute(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        max_num_batched_tokens=max_num_batched_tokens,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_seqs=max_num_seqs,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)

for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
@@ -84,17 +83,16 @@ def test_preemption(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        disable_log_stats=False,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    total_preemption = (
-        vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            disable_log_stats=False,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        total_preemption = (
+            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)

for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
@@ -139,19 +137,18 @@ def test_swap(
hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
max_tokens)

-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        swap_space=10,
-        disable_log_stats=False,
-    )
-    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    total_preemption = (
-        vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            swap_space=10,
+            disable_log_stats=False,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
+                                                       beam_width, max_tokens)
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        total_preemption = (
+            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)

for i in range(len(example_prompts)):
hf_output_ids, _ = hf_outputs[i]
@@ -196,28 +193,28 @@ def test_swap_infeasible(
decode_blocks = max_tokens // BLOCK_SIZE
example_prompts = example_prompts[:1]

-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        swap_space=10,
-        block_size=BLOCK_SIZE,
-        # Since beam search have more than 1 sequence, prefill + decode blocks
-        # are not enough to finish.
-        num_gpu_blocks_override=prefill_blocks + decode_blocks,
-        max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
-    )
-    sampling_params = SamplingParams(n=beam_width,
-                                     use_beam_search=True,
-                                     temperature=0.0,
-                                     max_tokens=max_tokens,
-                                     ignore_eos=True)
-    req_outputs = vllm_model.model.generate(
-        example_prompts,
-        sampling_params=sampling_params,
-    )
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            swap_space=10,
+            block_size=BLOCK_SIZE,
+            # Since beam search have more than 1 sequence, prefill +
+            # decode blocks are not enough to finish.
+            num_gpu_blocks_override=prefill_blocks + decode_blocks,
+            max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
+    ) as vllm_model:
+        sampling_params = SamplingParams(n=beam_width,
+                                         use_beam_search=True,
+                                         temperature=0.0,
+                                         max_tokens=max_tokens,
+                                         ignore_eos=True)
+        req_outputs = vllm_model.model.generate(
+            example_prompts,
+            sampling_params=sampling_params,
+        )
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)

# Verify the request is ignored and not hang.
assert req_outputs[0].outputs[0].finish_reason == "length"

@@ -236,25 +233,26 @@ def test_preemption_infeasible(
BLOCK_SIZE = 16
prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        block_size=BLOCK_SIZE,
-        # Not enough gpu blocks to complete a single sequence.
-        # preemption should happen, and the sequence should be
-        # ignored instead of hanging forever.
-        num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
-        max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-    )
-    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
-    req_outputs = vllm_model.model.generate(
-        example_prompts,
-        sampling_params=sampling_params,
-    )
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            block_size=BLOCK_SIZE,
+            # Not enough gpu blocks to complete a single sequence.
+            # preemption should happen, and the sequence should be
+            # ignored instead of hanging forever.
+            num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
+            max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
+    ) as vllm_model:
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         ignore_eos=True)
+        req_outputs = vllm_model.model.generate(
+            example_prompts,
+            sampling_params=sampling_params,
+        )
+
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)

-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
# Verify the request is ignored and not hang.
for req_output in req_outputs:
outputs = req_output.outputs
5 changes: 4 additions & 1 deletion tests/conftest.py
@@ -493,7 +493,10 @@ def encode(self, prompts: List[str]) -> List[List[float]]:
outputs.append(embedding)
return outputs

-    def __del__(self):
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
del self.model
cleanup()

15 changes: 7 additions & 8 deletions tests/distributed/test_basic_distributed_correctness.py
@@ -45,14 +45,13 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        tensor_parallel_size=2,
-        enforce_eager=enforce_eager,
-        distributed_executor_backend=distributed_executor_backend)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=2,
+                     enforce_eager=enforce_eager,
+                     distributed_executor_backend=distributed_executor_backend
+                     ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
21 changes: 10 additions & 11 deletions tests/distributed/test_chunked_prefill_distributed.py
@@ -48,17 +48,16 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        tensor_parallel_size=2,
-        max_num_seqs=max_num_seqs,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            max_num_seqs=max_num_seqs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
5 changes: 2 additions & 3 deletions tests/engine/test_stop_reason.py
@@ -19,9 +19,8 @@

@pytest.fixture
def vllm_model(vllm_runner):
-    vllm_model = vllm_runner(MODEL)
-    yield vllm_model
-    del vllm_model
+    with vllm_runner(MODEL) as vllm_model:
+        yield vllm_model


def test_stop_reason(vllm_model, example_prompts):
3 changes: 2 additions & 1 deletion tests/engine/test_stop_strings.py
@@ -10,7 +10,8 @@

@pytest.fixture(scope="session")
def vllm_model(vllm_runner):
-    return vllm_runner(MODEL)
+    with vllm_runner(MODEL) as vllm_model:
+        yield vllm_model


@pytest.mark.skip_global_cleanup
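The two fixture changes above follow the standard pytest pattern for context-managed resources: yielding from inside the with block keeps the model alive for every test that requests the fixture, and __exit__ runs during fixture teardown. A small generic sketch of that pattern, with an illustrative resource class rather than vLLM's runner:

# Generic pytest fixture pattern used above (illustrative names): the resource
# stays open across all tests that request the fixture and is released in
# __exit__ during fixture teardown, even if a test fails.
import pytest


class DummyResource:

    def __enter__(self):
        print("acquire")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        print("release")  # runs after the last dependent test finishes


@pytest.fixture(scope="session")
def resource():
    with DummyResource() as res:
        yield res  # teardown resumes here and closes the with block


def test_uses_resource(resource):
    assert resource is not None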
