diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index d7bb4f3aeef15..996b8147f2041 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -782,7 +782,6 @@ class GroupCoordinator:
 
     def prepare_communication_buffer_for_model(self, model: torch.nn.Module):
         if self.device_communicator is not None:
-            torch.distributed.barrier(self.device_communicator)
             self.device_communicator.prepare_communication_buffer_for_model(
                 model)
 
@@ -1116,6 +1115,7 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
     MoE all2all (DeepEP) usually allocate the communication buffer based
    on the model shape for optimal performance.
     """
+    get_world_group().barrier()
     if _TP is not None:
         _TP.prepare_communication_buffer_for_model(model)
     if _PP is not None:
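
The removed call passed self.device_communicator, a vLLM device-communicator object, as the first argument of torch.distributed.barrier, which expects a process group there; the patch replaces that per-coordinator barrier with a single barrier on the world group before any parallel group allocates its buffers. Below is a minimal sketch of the resulting synchronization pattern using plain torch.distributed; prepare_buffers and the tp_group/pp_group parameters are hypothetical stand-ins for vLLM's GroupCoordinator machinery, not the actual vLLM API.

import torch
import torch.distributed as dist


def prepare_buffers(model: torch.nn.Module,
                    group: dist.ProcessGroup) -> None:
    # Hypothetical stand-in for the device communicator's
    # prepare_communication_buffer_for_model: allocate group-specific
    # communication buffers sized from the model shape (e.g. the MoE
    # all2all / DeepEP buffers the docstring mentions).
    pass


def prepare_communication_buffer_for_model(model: torch.nn.Module,
                                           tp_group: dist.ProcessGroup,
                                           pp_group: dist.ProcessGroup) -> None:
    # Barrier once on the world group (dist.barrier() with no group
    # argument uses the default/world group) so every rank reaches this
    # point before any sub-group begins allocating its buffers.
    dist.barrier()
    prepare_buffers(model, tp_group)
    prepare_buffers(model, pp_group)

Placing the barrier once at the top-level entry point, rather than inside each GroupCoordinator, mirrors the diff: the TP and PP groups then prepare their buffers without any further per-group synchronization.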