Add gpu memory wait before test_async_tp (#28893)

Signed-off-by: angelayi <yiangela7@gmail.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-05-23 00:57:52 +08:00 · 2025-11-28 20:19:05 -08:00 · 2025-11-28 20:19:05 -08:00 · 4b17ce6815
commit 4b17ce6815
parent e23f665d83
2 changed files with 32 additions and 3 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -1313,11 +1313,11 @@ steps:
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - pytest -v -s tests/compile/distributed/test_async_tp.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
    - pytest -v -s tests/v1/distributed/test_dbo.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -1424,3 +1424,32 @@ def disable_deepgemm_ue8m0(monkeypatch):
        # Clear cache so the next time it is used it is processed with the
        # default VLLM_USE_DEEP_GEMM_E8M0  setting.
        is_deep_gemm_e8m0_used.cache_clear()
+
+
+@pytest.fixture(autouse=True)
+def clean_gpu_memory_between_tests():
+    if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1":
+        yield
+        return
+
+    # Wait for GPU memory to be cleared before starting the test
+    import gc
+
+    from tests.utils import wait_for_gpu_memory_to_clear
+
+    num_gpus = torch.cuda.device_count()
+    if num_gpus > 0:
+        try:
+            wait_for_gpu_memory_to_clear(
+                devices=list(range(num_gpus)),
+                threshold_ratio=0.1,
+            )
+        except ValueError as e:
+            logger.info("Failed to clean GPU memory: %s", e)
+
+    yield
+
+    # Clean up GPU memory after the test
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        gc.collect()