Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-15 08:55:02 +08:00)
Add gpu memory wait before test_async_tp (#28893)
Signed-off-by: angelayi <yiangela7@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
parent e23f665d83
commit 4b17ce6815
@@ -1313,11 +1313,11 @@ steps:
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - pytest -v -s tests/compile/distributed/test_async_tp.py
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
   - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-  - pytest -v -s tests/distributed/test_sequence_parallel.py
+  - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
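The VLLM_TEST_CLEAN_GPU_MEMORY=1 prefix only exports an environment variable into the pytest process; the actual waiting is done by the autouse fixture added in the conftest hunk below. As background, and purely as an illustrative snippet that is not part of this commit, the signal such a wait polls is the per-device free/total memory reported by torch.cuda.mem_get_info, which also reflects memory still held by a previous test's worker processes:

import torch

# Illustrative only (not part of this commit): report how much memory each
# visible GPU is still using. A "wait for memory to clear" step polls this
# same device-wide signal until the used fraction drops below a threshold.
if torch.cuda.is_available():
    for dev in range(torch.cuda.device_count()):
        free_bytes, total_bytes = torch.cuda.mem_get_info(dev)
        used_ratio = 1 - free_bytes / total_bytes
        print(f"cuda:{dev}: {used_ratio:.1%} of {total_bytes / 2**30:.1f} GiB in use")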
@@ -1424,3 +1424,32 @@ def disable_deepgemm_ue8m0(monkeypatch):
     # Clear cache so the next time it is used it is processed with the
     # default VLLM_USE_DEEP_GEMM_E8M0 setting.
     is_deep_gemm_e8m0_used.cache_clear()
+
+
+@pytest.fixture(autouse=True)
+def clean_gpu_memory_between_tests():
+    if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1":
+        yield
+        return
+
+    # Wait for GPU memory to be cleared before starting the test
+    import gc
+
+    from tests.utils import wait_for_gpu_memory_to_clear
+
+    num_gpus = torch.cuda.device_count()
+    if num_gpus > 0:
+        try:
+            wait_for_gpu_memory_to_clear(
+                devices=list(range(num_gpus)),
+                threshold_ratio=0.1,
+            )
+        except ValueError as e:
+            logger.info("Failed to clean GPU memory: %s", e)
+
+    yield
+
+    # Clean up GPU memory after the test
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        gc.collect()
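The fixture relies on wait_for_gpu_memory_to_clear from tests/utils, which this diff does not show. As a rough orientation, a helper with that call shape could poll each device until its used fraction falls below threshold_ratio and raise ValueError on timeout, which is the exception the fixture catches and downgrades to a log line. The sketch below is an assumption about that behavior, not the vLLM implementation; the keyword arguments come from the call site, and the 60-second default and polling interval are invented for illustration.

import time

import torch


def wait_for_gpu_memory_to_clear(
    *, devices: list[int], threshold_ratio: float, timeout_s: float = 60.0
) -> None:
    # Sketch only: the real helper lives in tests/utils and may use NVML and
    # different defaults. Poll each device until its used fraction is below
    # threshold_ratio, or raise ValueError once timeout_s expires (the
    # exception type the fixture above catches and logs).
    deadline = time.monotonic() + timeout_s
    while True:
        busy = []
        for dev in devices:
            free_bytes, total_bytes = torch.cuda.mem_get_info(dev)
            used_ratio = 1 - free_bytes / total_bytes
            if used_ratio > threshold_ratio:
                busy.append((dev, round(used_ratio, 3)))
        if not busy:
            return
        if time.monotonic() > deadline:
            raise ValueError(f"GPU memory did not clear in time: {busy}")
        time.sleep(1.0)

Because the fixture only logs the timeout instead of re-raising, a GPU that is slow to release memory appears to delay rather than fail the run, consistent with the opt-in, best-effort nature of the VLLM_TEST_CLEAN_GPU_MEMORY gate.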