Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Tyler Michael Smith 2025-07-10 18:35:43 -04:00
parent df866cfebf
commit 45ea3c31a2

View File

@ -1115,7 +1115,13 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
MoE all2all (DeepEP) usually allocate the communication buffer MoE all2all (DeepEP) usually allocate the communication buffer
based on the model shape for optimal performance. based on the model shape for optimal performance.
""" """
get_world_group().barrier() gpus = list(range(torch.cuda.device_count()))
orig = torch.cuda.current_device()
for d in gpus:
torch.cuda.set_device(d)
torch.zeros(1, device=f'cuda:{d}')
torch.cuda.set_device(orig)
print("pre-warmed all GPUs:", gpus)
if _TP is not None: if _TP is not None:
_TP.prepare_communication_buffer_for_model(model) _TP.prepare_communication_buffer_for_model(model)
if _PP is not None: if _PP is not None: