[Fix] Fall back to Gloo when NCCL backend is unavailable (#19641)

Signed-off-by: conroy-cheers <conroy@corncheese.org>
This commit is contained in:
Conroy Cheers 2025-06-17 10:42:14 +10:00 committed by GitHub
parent 6bc7b57315
commit 0860087aff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -938,6 +938,13 @@ def init_distributed_environment(
assert distributed_init_method is not None, (
"distributed_init_method must be provided when initializing "
"distributed environment")
if not torch.distributed.is_backend_available(backend):
logger.warning(
"Distributed backend %s is not available; "
"falling back to gloo.", backend)
assert torch.distributed.is_gloo_available(), (
"Fallback Gloo backend is not available.")
backend = "gloo"
# this backend is used for WORLD
torch.distributed.init_process_group(
backend=backend,