From 37cf1f27f20dcd380281c26fe715ede5f5937ea8 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Thu, 10 Jul 2025 18:56:08 -0400
Subject: [PATCH] hack 2

Signed-off-by: Tyler Michael Smith
---
 vllm/distributed/parallel_state.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index d1455e84aeccb..d78bcede2c407 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1115,13 +1115,12 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
     MoE all2all (DeepEP) usually allocate the communication buffer based
     on the model shape for optimal performance.
     """
-    gpus = list(range(torch.cuda.device_count()))
     orig = torch.cuda.current_device()
-    for d in gpus:
+    for d in range(8):
         torch.cuda.set_device(d)
         torch.zeros(1, device=f'cuda:{d}')
     torch.cuda.set_device(orig)
-    print("pre-warmed all GPUs:", gpus)
+    print("pre-warmed all GPUs")
     if _TP is not None:
         _TP.prepare_communication_buffer_for_model(model)
     if _PP is not None: