diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index f39aa40593261..770aa7d9dcc8a 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -297,10 +297,10 @@ class CoreEngineActorManager: local_engine_count = \ vllm_config.parallel_config.data_parallel_size_local - nodes = sorted(list_nodes(), + nodes = sorted(list_nodes(filters=[("state", "=", "ALIVE")]), key=lambda node: node.node_ip != dp_master_ip) assert nodes[0].node_ip == dp_master_ip, ( - "The first node must be the head node") + "The head node is missing or dead") assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, ( "There can only be one head node") @@ -312,6 +312,8 @@ class CoreEngineActorManager: for node in nodes: node_ip = node.node_ip node_resources = available_resources[node.node_id] + if "GPU" not in node_resources: + continue # For now, each DP rank can only be assigned to one node # TODO(rui): support allocating a single DP rank # to multiple nodes @@ -346,6 +348,13 @@ class CoreEngineActorManager: ) placement_groups.append(pg) local_dp_ranks.append(i) + if len(placement_groups) < num_pg_to_create: + raise ValueError( + f"Not enough resources to allocate {num_pg_to_create} " + "placement groups, only created " + f"{len(placement_groups)} placement groups. " + "Available resources: " + f"{available_resources}") return placement_groups, local_dp_ranks @staticmethod