diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index d1455e84aeccb..d78bcede2c407 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1115,13 +1115,12 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
     MoE all2all (DeepEP) usually allocate the communication buffer based on
     the model shape for optimal performance.
     """
-    gpus = list(range(torch.cuda.device_count()))
     orig = torch.cuda.current_device()
-    for d in gpus:
+    for d in range(torch.cuda.device_count()):
         torch.cuda.set_device(d)
         torch.zeros(1, device=f'cuda:{d}')
     torch.cuda.set_device(orig)
-    print("pre-warmed all GPUs:", gpus)
+    print("pre-warmed all GPUs")
     if _TP is not None:
         _TP.prepare_communication_buffer_for_model(model)
     if _PP is not None: