From 187bd5f7f33b183a413b37533ac2719388df58a2 Mon Sep 17 00:00:00 2001 From: yurekami Date: Thu, 25 Dec 2025 01:20:43 +0900 Subject: [PATCH 1/4] fix(ray): correct misleading warning message for multi-node clusters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #31005 The warning message incorrectly stated "Tensor parallel size" when it was actually comparing the world_size (TP × PP) against locally visible GPUs. This was confusing for multi-node Ray deployments where: 1. The world_size includes both tensor and pipeline parallelism 2. The "available GPUs" count only reflects the local node, not the full Ray cluster Changes: - Replaced "Tensor parallel size" with "World size" - Clarified that the GPU count is for locally visible devices - Added context about multi-node Ray clusters - Included the actual TP and PP values in the message for clarity Signed-off-by: yurekami 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- vllm/v1/executor/ray_utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py index 21910d1160bd4..2936855c84db5 100644 --- a/vllm/v1/executor/ray_utils.py +++ b/vllm/v1/executor/ray_utils.py @@ -329,14 +329,17 @@ def initialize_ray_cluster( available_gpus = cuda_device_count_stateless() if parallel_config.world_size > available_gpus: logger.warning( - "Tensor parallel size (%d) exceeds available GPUs (%d). " - "This may result in Ray placement group allocation failures. " - "Consider reducing tensor_parallel_size to %d or less, " - "or ensure your Ray cluster has %d GPUs available.", + "World size (%d) exceeds locally visible GPUs (%d). " + "For single-node deployments, this may result in Ray " + "placement group allocation failures. For multi-node Ray " + "clusters, ensure your cluster has %d GPUs available across " + "all nodes. 
(world_size = tensor_parallel_size=%d × " + "pipeline_parallel_size=%d)", parallel_config.world_size, available_gpus, - available_gpus, parallel_config.world_size, + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size, ) if ray.is_initialized(): From 9380f13f5b7d9823ac037e8114461b8ed61c5c96 Mon Sep 17 00:00:00 2001 From: yurekami Date: Thu, 25 Dec 2025 01:20:43 +0900 Subject: [PATCH 2/4] chore(config): add return type annotations to model config helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refs #31005 Add explicit return type annotations to the module-level helper functions in vllm/config/model.py. This patch does not touch the Ray warning message; no functional change is intended. Changes: - Imported Iterator from collections.abc for the generator return annotation - Annotated get_served_model_name, iter_architecture_defaults, str_dtype_to_torch_dtype, _is_valid_dtype, _check_valid_dtype, _find_dtype and _resolve_auto_dtype with their return types Signed-off-by: yurekami 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- vllm/config/model.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index a730aa8ad1b9c..8b26148ae36a0 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings -from collections.abc import Callable +from collections.abc import Callable, Iterator from dataclasses import InitVar, field from functools import cached_property from typing import TYPE_CHECKING, Any, Literal, cast, get_args @@ -1806,7 +1806,7 @@ class ModelConfig: return getattr(self.hf_config, "quantization_config", None) is not None -def
get_served_model_name(model: str, served_model_name: str | list[str] | None): +def get_served_model_name(model: str, served_model_name: str | list[str] | None) -> str: """ If the input is a non-empty list, the first model_name in `served_model_name` is taken. @@ -1844,7 +1844,9 @@ _SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [ ] -def iter_architecture_defaults(): +def iter_architecture_defaults() -> Iterator[ + tuple[str, tuple[RunnerType, ConvertType]] +]: yield from _SUFFIX_TO_DEFAULTS @@ -1877,7 +1879,7 @@ _STR_DTYPE_TO_TORCH_DTYPE = { } -def str_dtype_to_torch_dtype(type: str): +def str_dtype_to_torch_dtype(type: str) -> torch.dtype | None: return _STR_DTYPE_TO_TORCH_DTYPE.get(type) @@ -1891,14 +1893,14 @@ _FLOAT16_NOT_SUPPORTED_MODELS = { } -def _is_valid_dtype(model_type: str, dtype: torch.dtype): +def _is_valid_dtype(model_type: str, dtype: torch.dtype) -> bool: if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: # noqa: E501, SIM103 return False return True -def _check_valid_dtype(model_type: str, dtype: torch.dtype): +def _check_valid_dtype(model_type: str, dtype: torch.dtype) -> bool: if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type] raise ValueError( @@ -1913,7 +1915,7 @@ def _find_dtype( config: PretrainedConfig, *, revision: str | None, -): +) -> torch.dtype: # NOTE: getattr(config, "dtype", torch.float32) is not correct # because config.dtype can be None. 
config_dtype = getattr(config, "dtype", None) @@ -1953,7 +1955,7 @@ def _resolve_auto_dtype( config_dtype: torch.dtype, *, is_pooling_model: bool, -): +) -> torch.dtype: from vllm.platforms import current_platform supported_dtypes = [ From 0ec051a4c8bf7ee603ce198b63b63aa73ecfa5d8 Mon Sep 17 00:00:00 2001 From: yurekami Date: Thu, 25 Dec 2025 03:26:53 +0900 Subject: [PATCH 3/4] fix: include prefill_context_parallel_size in world_size breakdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback: the world_size calculation includes prefill_context_parallel_size, so include it in the breakdown when it's > 1 to make the message fully accurate. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 Signed-off-by: yurekami --- vllm/v1/executor/ray_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py index 2936855c84db5..ec0884c014f38 100644 --- a/vllm/v1/executor/ray_utils.py +++ b/vllm/v1/executor/ray_utils.py @@ -334,12 +334,15 @@ def initialize_ray_cluster( "placement group allocation failures. For multi-node Ray " "clusters, ensure your cluster has %d GPUs available across " "all nodes. 
(world_size = tensor_parallel_size=%d × " - "pipeline_parallel_size=%d)", + "pipeline_parallel_size=%d%s)", parallel_config.world_size, available_gpus, parallel_config.world_size, parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size, + (f" × prefill_context_parallel_size=" + f"{parallel_config.prefill_context_parallel_size}" + if parallel_config.prefill_context_parallel_size > 1 else ""), ) if ray.is_initialized(): From 74b3a2014a3cd207d526917438ae28fe9bcdccfa Mon Sep 17 00:00:00 2001 From: yurekami Date: Thu, 25 Dec 2025 03:34:46 +0900 Subject: [PATCH 4/4] style: fix formatting for pre-commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 Signed-off-by: yurekami --- vllm/v1/executor/ray_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py index ec0884c014f38..29cf0fc4958ab 100644 --- a/vllm/v1/executor/ray_utils.py +++ b/vllm/v1/executor/ray_utils.py @@ -340,9 +340,12 @@ def initialize_ray_cluster( parallel_config.world_size, parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size, - (f" × prefill_context_parallel_size=" - f"{parallel_config.prefill_context_parallel_size}" - if parallel_config.prefill_context_parallel_size > 1 else ""), + ( + f" × prefill_context_parallel_size=" + f"{parallel_config.prefill_context_parallel_size}" + if parallel_config.prefill_context_parallel_size > 1 + else "" + ), ) if ray.is_initialized():