[BugFix] Fix torch distributed stateless PG backend init (#14870)

Signed-off-by: Nick Hill <nhill@redhat.com>
Nick Hill, 2025-03-15 20:26:19 -07:00 (committed by GitHub)
parent 71c1e07107
commit b82662d952
2 changed files with 8 additions and 3 deletions


@@ -76,5 +76,10 @@ if __name__ == "__main__":
                               GPUs_per_dp_rank))
         proc.start()
         procs.append(proc)
+
+    exit_code = 0
     for proc in procs:
         proc.join()
+        if proc.exitcode:
+            exit_code = proc.exitcode
+    exit(exit_code)
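
For context, the exit-code propagation added above can be exercised standalone. A minimal sketch of the same pattern; the worker function and the deliberately failing rank are illustrative, not part of the diff:

import sys
from multiprocessing import Process


def worker(rank: int):
    # Pretend rank 1 fails so the parent has a nonzero code to propagate.
    if rank == 1:
        sys.exit(3)


if __name__ == "__main__":
    procs = [Process(target=worker, args=(rank, )) for rank in range(2)]
    for proc in procs:
        proc.start()
    exit_code = 0
    for proc in procs:
        proc.join()
        if proc.exitcode:  # nonzero (or negative on a signal) means failure
            exit_code = proc.exitcode
    sys.exit(exit_code)  # the parent now reports child failures to the caller

Without this, the parent always exits 0 even when a data-parallel worker dies, which is exactly what the hunk above fixes in the example script.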


@@ -299,13 +299,10 @@ def stateless_init_torch_distributed_process_group(
     # different systems (e.g. RPC) in case the store is multi-tenant.
     prefix_store = PrefixStore(init_method, store)
 
-    pg_options = ProcessGroup.Options(backend=backend, timeout=timeout)
-
     pg: ProcessGroup = ProcessGroup(
         prefix_store,
         group_rank,
         group_size,
-        pg_options,
     )
 
     if backend == "gloo":
@@ -327,7 +324,10 @@ def stateless_init_torch_distributed_process_group(
                                          backend_options)
         backend_type = ProcessGroup.BackendType.NCCL
         device = torch.device("cuda")
+    else:
+        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
 
+    pg._set_default_backend(backend_type)
     backend_class._set_sequence_number_for_group()
 
     pg._register_backend(device, backend_type, backend_class)
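
To see the fixed flow end to end, here is a minimal single-process sketch, assuming torch >= 2.6 (where ProcessGroup.Options no longer exists) and the gloo backend. The host, port, and store prefix are illustrative; the private ProcessGroup methods mirror the ones the diff itself calls:

import datetime

import torch
from torch.distributed import PrefixStore, TCPStore
from torch.distributed.distributed_c10d import ProcessGroup, ProcessGroupGloo

timeout = datetime.timedelta(seconds=30)
store = TCPStore("127.0.0.1",
                 29512,
                 world_size=1,
                 is_master=True,
                 timeout=timeout)
prefix_store = PrefixStore("sketch/", store)

# After the fix: construct the ProcessGroup without backend Options...
pg = ProcessGroup(prefix_store, 0, 1)
backend_class = ProcessGroupGloo(prefix_store, 0, 1, timeout=timeout)
backend_type = ProcessGroup.BackendType.GLOO
device = torch.device("cpu")

# ...then set the default backend explicitly before registering it, as the
# hunk above now does.
pg._set_default_backend(backend_type)
backend_class._set_sequence_number_for_group()
pg._register_backend(device, backend_type, backend_class)

# Sanity check: a collective on the stateless group works without
# torch.distributed.init_process_group ever being called.
t = torch.ones(1)
pg.allreduce([t]).wait()
assert t.item() == 1.0

Setting the default backend explicitly replaces the job the removed ProcessGroup.Options constructor argument used to do, and the new else branch turns a silent misconfiguration into an immediate RuntimeError for unsupported backends.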