[Bugfix] Improve GPU validation logging in Ray fallback scenarios (#25775)

Signed-off-by: Sairam Pillai <sairam.pillai61@gmail.com>
Author: Sairam Pillai, 2025-10-30 17:27:59 +05:30 (committed by GitHub)
parent c01f6e525f
commit 74374386e2
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
2 changed files with 49 additions and 15 deletions


@@ -521,15 +521,11 @@ class ParallelConfig:
                 current_platform.is_cuda()
                 and cuda_device_count_stateless() < self.world_size
             ):
-                if not ray_found:
-                    raise ValueError(
-                        "Unable to load Ray: "
-                        f"{ray_utils.ray_import_err}. Ray is "
-                        "required for multi-node inference, "
-                        "please install Ray with `pip install "
-                        "ray`."
-                    )
-                backend = "ray"
+                gpu_count = cuda_device_count_stateless()
+                raise ValueError(
+                    f"Tensor parallel size ({self.world_size}) cannot be "
+                    f"larger than the number of available GPUs ({gpu_count})."
+                )
             elif self.data_parallel_backend == "ray":
                 logger.info(
                     "Using ray distributed inference because "

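The hunk above changes the failure mode: instead of falling back to the Ray backend (and erroring only when Ray is missing), ParallelConfig now fails fast when the node has fewer GPUs than the requested tensor-parallel size. Below is a minimal sketch of the same check, reusing the `cuda_device_count_stateless` helper the diff already imports; `check_tensor_parallel_size` is a hypothetical standalone wrapper, not part of this commit.

```python
# Hypothetical standalone version of the validation added above.
# Assumes vLLM is installed; cuda_device_count_stateless is the same
# helper the diff imports from vllm.utils.
from vllm.utils import cuda_device_count_stateless


def check_tensor_parallel_size(tensor_parallel_size: int) -> None:
    """Fail fast if the requested TP size exceeds the visible GPUs."""
    gpu_count = cuda_device_count_stateless()
    if tensor_parallel_size > gpu_count:
        raise ValueError(
            f"Tensor parallel size ({tensor_parallel_size}) cannot be "
            f"larger than the number of available GPUs ({gpu_count})."
        )


if __name__ == "__main__":
    # On a 2-GPU node, requesting TP=4 now raises here instead of
    # surfacing later as a Ray placement-group timeout.
    check_tensor_parallel_size(4)
```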

@@ -255,12 +255,33 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
     try:
         ray.get(pg_ready_ref, timeout=0)
     except ray.exceptions.GetTimeoutError:
-        raise ValueError(
-            "Cannot provide a placement group of "
-            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
-            "`ray status` and `ray list nodes` to make sure the cluster has "
-            "enough resources."
-        ) from None
+        # Provide more helpful error message when GPU count is exceeded
+        total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
+        # If more than one GPU is required for the placement group, provide a
+        # more specific error message.
+        # We use >1 here because multi-GPU (tensor parallel) jobs are more
+        # likely to fail due to insufficient cluster resources, and users may
+        # need to adjust tensor_parallel_size to fit available GPUs.
+        if total_gpu_required > 1:
+            raise ValueError(
+                f"Cannot provide a placement group requiring "
+                f"{total_gpu_required} GPUs "
+                f"(placement_group_specs={placement_group_specs}) within "
+                f"{PG_WAIT_TIMEOUT} seconds.\n"
+                f"Tensor parallel size may exceed available GPUs in your "
+                f"cluster. Check resources with `ray status` and "
+                f"`ray list nodes`.\n"
+                f"If running on K8s with limited GPUs, consider reducing "
+                f"--tensor-parallel-size to match available GPU resources."
+            ) from None
+        else:
+            raise ValueError(
+                "Cannot provide a placement group of "
+                f"{placement_group_specs=} within "
+                f"{PG_WAIT_TIMEOUT} seconds. See "
+                "`ray status` and `ray list nodes` to make sure the cluster "
+                "has enough resources."
+            ) from None


 def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
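For reference, the `total_gpu_required` branch above keys off the sum of the `"GPU"` fields in the placement group bundle specs. A small, self-contained example follows; the bundle list is made up, but shaped like the one-GPU-per-worker bundles a tensor_parallel_size=4 run requests.

```python
# Worked example of the total_gpu_required computation introduced above.
# The bundle list is hypothetical; in _wait_until_pg_ready the specs come
# from the placement group's bundles.
placement_group_specs = [{"GPU": 1.0} for _ in range(4)]

total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
print(total_gpu_required)  # 4.0 -> takes the detailed multi-GPU error branch

# A CPU-only placement group such as [{"CPU": 1}] sums to 0, so it keeps
# the original, generic timeout message in the else branch.
```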
@@ -299,6 +320,23 @@ def initialize_ray_cluster(
     assert_ray_available()
     from vllm.platforms import current_platform

+    # Prevalidate GPU requirements before Ray processing
+    if current_platform.is_cuda() and parallel_config.world_size > 1:
+        from vllm.utils import cuda_device_count_stateless
+
+        available_gpus = cuda_device_count_stateless()
+        if parallel_config.world_size > available_gpus:
+            logger.warning(
+                "Tensor parallel size (%d) exceeds available GPUs (%d). "
+                "This may result in Ray placement group allocation failures. "
+                "Consider reducing tensor_parallel_size to %d or less, "
+                "or ensure your Ray cluster has %d GPUs available.",
+                parallel_config.world_size,
+                available_gpus,
+                available_gpus,
+                parallel_config.world_size,
+            )
+
     if ray.is_initialized():
         logger.info("Ray is already initialized. Skipping Ray initialization.")
     elif current_platform.is_rocm() or current_platform.is_xpu():
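Note that this prevalidation only warns rather than raising, since a multi-node Ray cluster may legitimately expose more GPUs than the local node reports. Below is a minimal, runnable sketch of the same warning path, with made-up numbers standing in for `parallel_config.world_size` and the `cuda_device_count_stateless()` result.

```python
# Self-contained sketch of the warning added above; the values below are
# illustrative stand-ins, not taken from a real run.
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("vllm.example")  # hypothetical logger name

world_size = 4      # pretend parallel_config.world_size
available_gpus = 2  # pretend cuda_device_count_stateless() returned 2

if world_size > available_gpus:
    # Arguments are passed printf-style so formatting stays lazy,
    # matching the logger usage in the hunk above.
    logger.warning(
        "Tensor parallel size (%d) exceeds available GPUs (%d). "
        "This may result in Ray placement group allocation failures. "
        "Consider reducing tensor_parallel_size to %d or less, "
        "or ensure your Ray cluster has %d GPUs available.",
        world_size,
        available_gpus,
        available_gpus,
        world_size,
    )
```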