mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 10:17:13 +08:00
hack 2
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
parent
45ea3c31a2
commit
37cf1f27f2
@ -1115,13 +1115,12 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
|
||||
MoE all2all (DeepEP) usually allocates the communication buffer
|
||||
based on the model shape for optimal performance.
|
||||
"""
|
||||
gpus = list(range(torch.cuda.device_count()))
|
||||
orig = torch.cuda.current_device()
|
||||
for d in gpus:
|
||||
for d in range(8):
|
||||
torch.cuda.set_device(d)
|
||||
torch.zeros(1, device=f'cuda:{d}')
|
||||
torch.cuda.set_device(orig)
|
||||
print("pre-warmed all GPUs:", gpus)
|
||||
print("pre-warmed all GPUs")
|
||||
if _TP is not None:
|
||||
_TP.prepare_communication_buffer_for_model(model)
|
||||
if _PP is not None:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user