diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 996b8147f2041..d1455e84aeccb 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1115,7 +1115,13 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
     MoE all2all (DeepEP) usually allocate the communication buffer based
     on the model shape for optimal performance.
     """
-    get_world_group().barrier()
+    gpus = list(range(torch.cuda.device_count()))
+    orig = torch.cuda.current_device()
+    for d in gpus:
+        torch.cuda.set_device(d)
+        torch.zeros(1, device=f'cuda:{d}')
+    torch.cuda.set_device(orig)
+    print("pre-warmed all GPUs:", gpus)
     if _TP is not None:
         _TP.prepare_communication_buffer_for_model(model)
     if _PP is not None:
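
For reference, the per-device warm-up the hunk introduces can be read as a standalone helper: a tiny allocation on each visible GPU forces lazy CUDA context creation before the communication buffers are prepared. The sketch below only illustrates that pattern under stated assumptions; the helper name `prewarm_all_gpus` and the try/finally guard are not part of the patch or of vLLM.

import torch

def prewarm_all_gpus() -> list[int]:
    """Initialize a CUDA context on every visible GPU (illustrative sketch).

    Mirrors the loop added in the diff: allocating a 1-element tensor on a
    device is enough to trigger context creation, so later buffer setup does
    not pay that cost. Returns the list of device ids that were warmed.
    """
    if not torch.cuda.is_available():
        return []
    gpus = list(range(torch.cuda.device_count()))
    orig = torch.cuda.current_device()
    try:
        for d in gpus:
            torch.cuda.set_device(d)
            # Minimal allocation; the value is discarded immediately.
            torch.zeros(1, device=f"cuda:{d}")
    finally:
        # Restore whichever device the caller had selected.
        torch.cuda.set_device(orig)
    return gpus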