[Bugfix] Improve GPU validation logging in Ray fallback scenarios (#25775)
Signed-off-by: Sairam Pillai <sairam.pillai61@gmail.com>
parent c01f6e525f
commit 74374386e2
@@ -521,15 +521,11 @@ class ParallelConfig:
                 current_platform.is_cuda()
                 and cuda_device_count_stateless() < self.world_size
             ):
-                if not ray_found:
-                    raise ValueError(
-                        "Unable to load Ray: "
-                        f"{ray_utils.ray_import_err}. Ray is "
-                        "required for multi-node inference, "
-                        "please install Ray with `pip install "
-                        "ray`."
-                    )
-                backend = "ray"
+                gpu_count = cuda_device_count_stateless()
+                raise ValueError(
+                    f"Tensor parallel size ({self.world_size}) cannot be "
+                    f"larger than the number of available GPUs ({gpu_count})."
+                )
             elif self.data_parallel_backend == "ray":
                 logger.info(
                     "Using ray distributed inference because "
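As reconstructed above, this hunk drops the Ray-fallback branch (backend = "ray") in favor of raising a clear ValueError when the local CUDA device count is smaller than the configured world size. A minimal standalone sketch of that check follows; the function name and example values are illustrative only, and the gpu_count argument stands in for vllm.utils.cuda_device_count_stateless():

def validate_tensor_parallel_size(world_size: int, gpu_count: int) -> None:
    # Fail fast when more tensor-parallel workers are requested than GPUs exist.
    if gpu_count < world_size:
        raise ValueError(
            f"Tensor parallel size ({world_size}) cannot be "
            f"larger than the number of available GPUs ({gpu_count})."
        )

# Hypothetical single-node setup: 2 GPUs, tensor_parallel_size=4.
try:
    validate_tensor_parallel_size(world_size=4, gpu_count=2)
except ValueError as exc:
    print(exc)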
@@ -255,12 +255,33 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
     try:
         ray.get(pg_ready_ref, timeout=0)
     except ray.exceptions.GetTimeoutError:
-        raise ValueError(
-            "Cannot provide a placement group of "
-            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
-            "`ray status` and `ray list nodes` to make sure the cluster has "
-            "enough resources."
-        ) from None
+        # Provide more helpful error message when GPU count is exceeded
+        total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
+        # If more than one GPU is required for the placement group, provide a
+        # more specific error message.
+        # We use >1 here because multi-GPU (tensor parallel) jobs are more
+        # likely to fail due to insufficient cluster resources, and users may
+        # need to adjust tensor_parallel_size to fit available GPUs.
+        if total_gpu_required > 1:
+            raise ValueError(
+                f"Cannot provide a placement group requiring "
+                f"{total_gpu_required} GPUs "
+                f"(placement_group_specs={placement_group_specs}) within "
+                f"{PG_WAIT_TIMEOUT} seconds.\n"
+                f"Tensor parallel size may exceed available GPUs in your "
+                f"cluster. Check resources with `ray status` and "
+                f"`ray list nodes`.\n"
+                f"If running on K8s with limited GPUs, consider reducing "
+                f"--tensor-parallel-size to match available GPU resources."
+            ) from None
+        else:
+            raise ValueError(
+                "Cannot provide a placement group of "
+                f"{placement_group_specs=} within "
+                f"{PG_WAIT_TIMEOUT} seconds. See "
+                "`ray status` and `ray list nodes` to make sure the cluster "
+                "has enough resources."
+            ) from None
 
 
 def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
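To make the GPU-counting step above concrete, here is a small self-contained sketch; the bundle contents are hypothetical and only meant to show how total_gpu_required is derived from placement_group_specs:

# Hypothetical bundles for tensor_parallel_size=4, one GPU per worker bundle.
placement_group_specs = [{"GPU": 1.0} for _ in range(4)]

# Same expression as in the hunk above.
total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)

print(total_gpu_required)  # 4.0 -> greater than 1, so the detailed error message is used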
@@ -299,6 +320,23 @@ def initialize_ray_cluster(
     assert_ray_available()
     from vllm.platforms import current_platform
 
+    # Prevalidate GPU requirements before Ray processing
+    if current_platform.is_cuda() and parallel_config.world_size > 1:
+        from vllm.utils import cuda_device_count_stateless
+
+        available_gpus = cuda_device_count_stateless()
+        if parallel_config.world_size > available_gpus:
+            logger.warning(
+                "Tensor parallel size (%d) exceeds available GPUs (%d). "
+                "This may result in Ray placement group allocation failures. "
+                "Consider reducing tensor_parallel_size to %d or less, "
+                "or ensure your Ray cluster has %d GPUs available.",
+                parallel_config.world_size,
+                available_gpus,
+                available_gpus,
+                parallel_config.world_size,
+            )
+
     if ray.is_initialized():
         logger.info("Ray is already initialized. Skipping Ray initialization.")
     elif current_platform.is_rocm() or current_platform.is_xpu():
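The warning added above uses printf-style lazy formatting, so the four positional arguments fill the four %d placeholders in order (world size, available GPUs, suggested new size, required cluster GPUs). A runnable sketch with hypothetical values and an illustrative logger name:

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("ray_prevalidation_example")  # illustrative name, not vLLM's

world_size, available_gpus = 4, 2  # hypothetical configuration
if world_size > available_gpus:
    logger.warning(
        "Tensor parallel size (%d) exceeds available GPUs (%d). "
        "This may result in Ray placement group allocation failures. "
        "Consider reducing tensor_parallel_size to %d or less, "
        "or ensure your Ray cluster has %d GPUs available.",
        world_size,
        available_gpus,
        available_gpus,
        world_size,
    )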