From 37cf1f27f20dcd380281c26fe715ede5f5937ea8 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Thu, 10 Jul 2025 18:56:08 -0400
Subject: [PATCH] hack 2

Signed-off-by: Tyler Michael Smith
---
 vllm/distributed/parallel_state.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index d1455e84aeccb..d78bcede2c407 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1115,13 +1115,12 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
     MoE all2all (DeepEP) usually allocate the communication buffer based
     on the model shape for optimal performance.
     """
-    gpus = list(range(torch.cuda.device_count()))
     orig = torch.cuda.current_device()
-    for d in gpus:
+    for d in range(8):
         torch.cuda.set_device(d)
         torch.zeros(1, device=f'cuda:{d}')
     torch.cuda.set_device(orig)
-    print("pre-warmed all GPUs:", gpus)
+    print("pre-warmed all GPUs")
     if _TP is not None:
         _TP.prepare_communication_buffer_for_model(model)
     if _PP is not None: