mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 01:05:28 +08:00
[BugFix] Fix torch distributed stateless PG backend init (#14870)
Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
parent
71c1e07107
commit
b82662d952
@ -76,5 +76,10 @@ if __name__ == "__main__":
|
|||||||
GPUs_per_dp_rank))
|
GPUs_per_dp_rank))
|
||||||
proc.start()
|
proc.start()
|
||||||
procs.append(proc)
|
procs.append(proc)
|
||||||
|
exit_code = 0
|
||||||
for proc in procs:
|
for proc in procs:
|
||||||
proc.join()
|
proc.join()
|
||||||
|
if proc.exitcode:
|
||||||
|
exit_code = proc.exitcode
|
||||||
|
|
||||||
|
exit(exit_code)
|
||||||
|
|||||||
@ -299,13 +299,10 @@ def stateless_init_torch_distributed_process_group(
|
|||||||
# different systems (e.g. RPC) in case the store is multi-tenant.
|
# different systems (e.g. RPC) in case the store is multi-tenant.
|
||||||
prefix_store = PrefixStore(init_method, store)
|
prefix_store = PrefixStore(init_method, store)
|
||||||
|
|
||||||
pg_options = ProcessGroup.Options(backend=backend, timeout=timeout)
|
|
||||||
|
|
||||||
pg: ProcessGroup = ProcessGroup(
|
pg: ProcessGroup = ProcessGroup(
|
||||||
prefix_store,
|
prefix_store,
|
||||||
group_rank,
|
group_rank,
|
||||||
group_size,
|
group_size,
|
||||||
pg_options,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if backend == "gloo":
|
if backend == "gloo":
|
||||||
@ -327,7 +324,10 @@ def stateless_init_torch_distributed_process_group(
|
|||||||
backend_options)
|
backend_options)
|
||||||
backend_type = ProcessGroup.BackendType.NCCL
|
backend_type = ProcessGroup.BackendType.NCCL
|
||||||
device = torch.device("cuda")
|
device = torch.device("cuda")
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
|
||||||
|
|
||||||
|
pg._set_default_backend(backend_type)
|
||||||
backend_class._set_sequence_number_for_group()
|
backend_class._set_sequence_number_for_group()
|
||||||
|
|
||||||
pg._register_backend(device, backend_type, backend_class)
|
pg._register_backend(device, backend_type, backend_class)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user