[Bugfix] Skip dead and non-GPU nodes for Ray DP engine allocation (#22275)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
This commit is contained in:
Rui Qiao 2025-08-05 20:35:32 -07:00 committed by GitHub
parent 7e6544c797
commit 302962e806
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -297,10 +297,10 @@ class CoreEngineActorManager:
local_engine_count = \
vllm_config.parallel_config.data_parallel_size_local
nodes = sorted(list_nodes(),
nodes = sorted(list_nodes(filters=[("state", "=", "ALIVE")]),
key=lambda node: node.node_ip != dp_master_ip)
assert nodes[0].node_ip == dp_master_ip, (
"The first node must be the head node")
"The head node is missing or dead")
assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, (
"There can only be one head node")
@ -312,6 +312,8 @@ class CoreEngineActorManager:
for node in nodes:
node_ip = node.node_ip
node_resources = available_resources[node.node_id]
if "GPU" not in node_resources:
continue
# For now, each DP rank can only be assigned to one node
# TODO(rui): support allocating a single DP rank
# to multiple nodes
@ -346,6 +348,13 @@ class CoreEngineActorManager:
)
placement_groups.append(pg)
local_dp_ranks.append(i)
if len(placement_groups) < num_pg_to_create:
raise ValueError(
f"Not enough resources to allocate {num_pg_to_create} "
"placement groups, only created "
f"{len(placement_groups)} placement groups. "
"Available resources: "
f"{available_resources}")
return placement_groups, local_dp_ranks
@staticmethod