mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-01-05 06:57:27 +08:00
[Bugfix] Skip dead and non-GPU nodes for Ray DP engine allocation (#22275)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
This commit is contained in:
parent
7e6544c797
commit
302962e806
@ -297,10 +297,10 @@ class CoreEngineActorManager:
|
||||
local_engine_count = \
|
||||
vllm_config.parallel_config.data_parallel_size_local
|
||||
|
||||
nodes = sorted(list_nodes(),
|
||||
nodes = sorted(list_nodes(filters=[("state", "=", "ALIVE")]),
|
||||
key=lambda node: node.node_ip != dp_master_ip)
|
||||
assert nodes[0].node_ip == dp_master_ip, (
|
||||
"The first node must be the head node")
|
||||
"The head node is missing or dead")
|
||||
assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, (
|
||||
"There can only be one head node")
|
||||
|
||||
@ -312,6 +312,8 @@ class CoreEngineActorManager:
|
||||
for node in nodes:
|
||||
node_ip = node.node_ip
|
||||
node_resources = available_resources[node.node_id]
|
||||
if "GPU" not in node_resources:
|
||||
continue
|
||||
# For now, each DP rank can only be assigned to one node
|
||||
# TODO(rui): support allocating a single DP rank
|
||||
# to multiple nodes
|
||||
@ -346,6 +348,13 @@ class CoreEngineActorManager:
|
||||
)
|
||||
placement_groups.append(pg)
|
||||
local_dp_ranks.append(i)
|
||||
if len(placement_groups) < num_pg_to_create:
|
||||
raise ValueError(
|
||||
f"Not enough resources to allocate {num_pg_to_create} "
|
||||
"placement groups, only created "
|
||||
f"{len(placement_groups)} placement groups. "
|
||||
"Available resources: "
|
||||
f"{available_resources}")
|
||||
return placement_groups, local_dp_ranks
|
||||
|
||||
@staticmethod
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user