From 302962e806e9820643ae25987e8e38ed035e05d3 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Tue, 5 Aug 2025 20:35:32 -0700 Subject: [PATCH] [Bugfix] Skip dead and non-GPU nodes for Ray DP engine allocation (#22275) Signed-off-by: Rui Qiao --- vllm/v1/engine/utils.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index f39aa40593261..770aa7d9dcc8a 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -297,10 +297,10 @@ class CoreEngineActorManager: local_engine_count = \ vllm_config.parallel_config.data_parallel_size_local - nodes = sorted(list_nodes(), + nodes = sorted(list_nodes(filters=[("state", "=", "ALIVE")]), key=lambda node: node.node_ip != dp_master_ip) assert nodes[0].node_ip == dp_master_ip, ( - "The first node must be the head node") + "The head node is missing or dead") assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, ( "There can only be one head node") @@ -312,6 +312,8 @@ class CoreEngineActorManager: for node in nodes: node_ip = node.node_ip node_resources = available_resources[node.node_id] + if "GPU" not in node_resources: + continue # For now, each DP rank can only be assigned to one node # TODO(rui): support allocating a single DP rank # to multiple nodes @@ -346,6 +348,13 @@ class CoreEngineActorManager: ) placement_groups.append(pg) local_dp_ranks.append(i) + if len(placement_groups) < num_pg_to_create: + raise ValueError( + f"Not enough resources to allocate {num_pg_to_create} " + "placement groups, only created " + f"{len(placement_groups)} placement groups. " + "Available resources: " + f"{available_resources}") return placement_groups, local_dp_ranks @staticmethod