From 0860087affa8eb0ba0490e02bb965d3aec421952 Mon Sep 17 00:00:00 2001 From: Conroy Cheers Date: Tue, 17 Jun 2025 10:42:14 +1000 Subject: [PATCH] [Fix] Fall back to Gloo when NCCL backend is unavailable (#19641) Signed-off-by: conroy-cheers --- vllm/distributed/parallel_state.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 10f87c49baa9..126160b09553 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -938,6 +938,13 @@ def init_distributed_environment( assert distributed_init_method is not None, ( "distributed_init_method must be provided when initializing " "distributed environment") + if not torch.distributed.is_backend_available(backend): + logger.warning( + "Distributed backend %s is not available; " + "falling back to gloo.", backend) + assert torch.distributed.is_gloo_available(), ( + "Fallback Gloo backend is not available.") + backend = "gloo" # this backend is used for WORLD torch.distributed.init_process_group( backend=backend,