[BugFix] Fix torch distributed stateless PG backend init (#14870)

Signed-off-by: Nick Hill <nhill@redhat.com>
Nick Hill, 2025-03-15 20:26:19 -07:00 (committed by GitHub)
parent 71c1e07107
commit b82662d952
2 changed files with 8 additions and 3 deletions


@@ -76,5 +76,10 @@ if __name__ == "__main__":
                               GPUs_per_dp_rank))
         proc.start()
         procs.append(proc)
+
+    exit_code = 0
     for proc in procs:
         proc.join()
+        if proc.exitcode:
+            exit_code = proc.exitcode
+    exit(exit_code)
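
For context, the exit-code propagation added above can be exercised standalone. A minimal sketch of the same pattern; the worker function and the deliberately failing rank are illustrative, not part of the diff:

import sys
from multiprocessing import Process


def worker(rank: int):
    # Pretend rank 1 fails so the parent has a nonzero code to propagate.
    if rank == 1:
        sys.exit(3)


if __name__ == "__main__":
    procs = [Process(target=worker, args=(rank, )) for rank in range(2)]
    for proc in procs:
        proc.start()
    exit_code = 0
    for proc in procs:
        proc.join()
        if proc.exitcode:  # nonzero (or negative on a signal) means failure
            exit_code = proc.exitcode
    sys.exit(exit_code)  # the parent now reports child failures to the caller

Without this, the parent always exits 0 even when a data-parallel worker dies, which is exactly what the hunk above fixes in the example script.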


@@ -299,13 +299,10 @@ def stateless_init_torch_distributed_process_group(
     # different systems (e.g. RPC) in case the store is multi-tenant.
     prefix_store = PrefixStore(init_method, store)
 
-    pg_options = ProcessGroup.Options(backend=backend, timeout=timeout)
-
     pg: ProcessGroup = ProcessGroup(
         prefix_store,
         group_rank,
         group_size,
-        pg_options,
     )
 
     if backend == "gloo":
@@ -327,7 +324,10 @@ def stateless_init_torch_distributed_process_group(
                                          backend_options)
         backend_type = ProcessGroup.BackendType.NCCL
         device = torch.device("cuda")
+    else:
+        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
 
+    pg._set_default_backend(backend_type)
     backend_class._set_sequence_number_for_group()
 
     pg._register_backend(device, backend_type, backend_class)
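
To see the fixed flow end to end, here is a minimal single-process sketch, assuming torch >= 2.6 (where ProcessGroup.Options no longer exists) and the gloo backend. The host, port, and store prefix are illustrative; the private ProcessGroup methods mirror the ones the diff itself calls:

import datetime

import torch
from torch.distributed import PrefixStore, TCPStore
from torch.distributed.distributed_c10d import ProcessGroup, ProcessGroupGloo

timeout = datetime.timedelta(seconds=30)
store = TCPStore("127.0.0.1",
                 29512,
                 world_size=1,
                 is_master=True,
                 timeout=timeout)
prefix_store = PrefixStore("sketch/", store)

# After the fix: construct the ProcessGroup without backend Options...
pg = ProcessGroup(prefix_store, 0, 1)
backend_class = ProcessGroupGloo(prefix_store, 0, 1, timeout=timeout)
backend_type = ProcessGroup.BackendType.GLOO
device = torch.device("cpu")

# ...then set the default backend explicitly before registering it, as the
# hunk above now does.
pg._set_default_backend(backend_type)
backend_class._set_sequence_number_for_group()
pg._register_backend(device, backend_type, backend_class)

# Sanity check: a collective on the stateless group works without
# torch.distributed.init_process_group ever being called.
t = torch.ones(1)
pg.allreduce([t]).wait()
assert t.item() == 1.0

Setting the default backend explicitly replaces the job the removed ProcessGroup.Options constructor argument used to do, and the new else branch turns a silent misconfiguration into an immediate RuntimeError for unsupported backends.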