From 187bd5f7f33b183a413b37533ac2719388df58a2 Mon Sep 17 00:00:00 2001
From: yurekami <yurekami@users.noreply.github.com>
Date: Thu, 25 Dec 2025 01:20:43 +0900
Subject: [PATCH 1/4] fix(ray): correct misleading warning message for
 multi-node clusters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #31005

The warning message incorrectly stated "Tensor parallel size" when it
was actually comparing the world_size (TP × PP) against locally visible
GPUs. This was confusing for multi-node Ray deployments where:

1. The world_size includes both tensor and pipeline parallelism
2. The "available GPUs" count only reflects the local node, not the
   full Ray cluster

Changes:
- Replaced "Tensor parallel size" with "World size"
- Clarified that the GPU count is for locally visible devices
- Added context about multi-node Ray clusters
- Included the actual TP and PP values in the message for clarity

Signed-off-by: yurekami <yurekami@users.noreply.github.com>

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 vllm/v1/executor/ray_utils.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index 21910d1160bd4..2936855c84db5 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -329,14 +329,17 @@ def initialize_ray_cluster(
         available_gpus = cuda_device_count_stateless()
         if parallel_config.world_size > available_gpus:
             logger.warning(
-                "Tensor parallel size (%d) exceeds available GPUs (%d). "
-                "This may result in Ray placement group allocation failures. "
-                "Consider reducing tensor_parallel_size to %d or less, "
-                "or ensure your Ray cluster has %d GPUs available.",
+                "World size (%d) exceeds locally visible GPUs (%d). "
+                "For single-node deployments, this may result in Ray "
+                "placement group allocation failures. For multi-node Ray "
+                "clusters, ensure your cluster has %d GPUs available across "
+                "all nodes. (world_size = tensor_parallel_size=%d × "
+                "pipeline_parallel_size=%d)",
                 parallel_config.world_size,
                 available_gpus,
-                available_gpus,
                 parallel_config.world_size,
+                parallel_config.tensor_parallel_size,
+                parallel_config.pipeline_parallel_size,
             )
 
     if ray.is_initialized():

From 9380f13f5b7d9823ac037e8114461b8ed61c5c96 Mon Sep 17 00:00:00 2001
From: yurekami <yurekami@users.noreply.github.com>
Date: Thu, 25 Dec 2025 01:20:43 +0900
Subject: [PATCH 2/4] chore(config): add return type annotations to
 model.py helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refs #31005

Add explicit return type annotations to the module-level helper
functions in vllm/config/model.py. Only function signatures and one
import line change; no function bodies are modified.

Changes:
- Imported Iterator from collections.abc
- Annotated get_served_model_name as returning str
- Annotated iter_architecture_defaults as returning
  Iterator[tuple[str, tuple[RunnerType, ConvertType]]]
- Annotated str_dtype_to_torch_dtype as returning torch.dtype | None
- Annotated _is_valid_dtype and _check_valid_dtype as returning bool
- Annotated _find_dtype and _resolve_auto_dtype as returning torch.dtype

Signed-off-by: yurekami <yurekami@users.noreply.github.com>

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 vllm/config/model.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index a730aa8ad1b9c..8b26148ae36a0 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
-from collections.abc import Callable
+from collections.abc import Callable, Iterator
 from dataclasses import InitVar, field
 from functools import cached_property
 from typing import TYPE_CHECKING, Any, Literal, cast, get_args
@@ -1806,7 +1806,7 @@ class ModelConfig:
         return getattr(self.hf_config, "quantization_config", None) is not None
 
 
-def get_served_model_name(model: str, served_model_name: str | list[str] | None):
+def get_served_model_name(model: str, served_model_name: str | list[str] | None) -> str:
     """
     If the input is a non-empty list, the first model_name in
     `served_model_name` is taken.
@@ -1844,7 +1844,9 @@ _SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [
 ]
 
 
-def iter_architecture_defaults():
+def iter_architecture_defaults() -> Iterator[
+    tuple[str, tuple[RunnerType, ConvertType]]
+]:
     yield from _SUFFIX_TO_DEFAULTS
 
 
@@ -1877,7 +1879,7 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
 }
 
 
-def str_dtype_to_torch_dtype(type: str):
+def str_dtype_to_torch_dtype(type: str) -> torch.dtype | None:
     return _STR_DTYPE_TO_TORCH_DTYPE.get(type)
 
 
@@ -1891,14 +1893,14 @@ _FLOAT16_NOT_SUPPORTED_MODELS = {
 }
 
 
-def _is_valid_dtype(model_type: str, dtype: torch.dtype):
+def _is_valid_dtype(model_type: str, dtype: torch.dtype) -> bool:
     if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16:  # noqa: E501, SIM103
         return False
 
     return True
 
 
-def _check_valid_dtype(model_type: str, dtype: torch.dtype):
+def _check_valid_dtype(model_type: str, dtype: torch.dtype) -> bool:
     if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16:
         reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type]
         raise ValueError(
@@ -1913,7 +1915,7 @@ def _find_dtype(
     config: PretrainedConfig,
     *,
     revision: str | None,
-):
+) -> torch.dtype:
     # NOTE: getattr(config, "dtype", torch.float32) is not correct
     # because config.dtype can be None.
     config_dtype = getattr(config, "dtype", None)
@@ -1953,7 +1955,7 @@ def _resolve_auto_dtype(
     config_dtype: torch.dtype,
     *,
     is_pooling_model: bool,
-):
+) -> torch.dtype:
     from vllm.platforms import current_platform
 
     supported_dtypes = [

From 0ec051a4c8bf7ee603ce198b63b63aa73ecfa5d8 Mon Sep 17 00:00:00 2001
From: yurekami <yurekami@users.noreply.github.com>
Date: Thu, 25 Dec 2025 03:26:53 +0900
Subject: [PATCH 3/4] fix: include prefill_context_parallel_size in world_size
 breakdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address review feedback: the world_size calculation includes
prefill_context_parallel_size, so include it in the breakdown
when it's > 1 to make the message fully accurate.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: yurekami <yurekami@users.noreply.github.com>
---
 vllm/v1/executor/ray_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index 2936855c84db5..ec0884c014f38 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -334,12 +334,15 @@ def initialize_ray_cluster(
                 "placement group allocation failures. For multi-node Ray "
                 "clusters, ensure your cluster has %d GPUs available across "
                 "all nodes. (world_size = tensor_parallel_size=%d × "
-                "pipeline_parallel_size=%d)",
+                "pipeline_parallel_size=%d%s)",
                 parallel_config.world_size,
                 available_gpus,
                 parallel_config.world_size,
                 parallel_config.tensor_parallel_size,
                 parallel_config.pipeline_parallel_size,
+                (f" × prefill_context_parallel_size="
+                 f"{parallel_config.prefill_context_parallel_size}"
+                 if parallel_config.prefill_context_parallel_size > 1 else ""),
             )
 
     if ray.is_initialized():

From 74b3a2014a3cd207d526917438ae28fe9bcdccfa Mon Sep 17 00:00:00 2001
From: yurekami <yurekami@users.noreply.github.com>
Date: Thu, 25 Dec 2025 03:34:46 +0900
Subject: [PATCH 4/4] style: fix formatting for pre-commit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: yurekami <yurekami@users.noreply.github.com>
---
 vllm/v1/executor/ray_utils.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py
index ec0884c014f38..29cf0fc4958ab 100644
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -340,9 +340,12 @@ def initialize_ray_cluster(
                 parallel_config.world_size,
                 parallel_config.tensor_parallel_size,
                 parallel_config.pipeline_parallel_size,
-                (f" × prefill_context_parallel_size="
-                 f"{parallel_config.prefill_context_parallel_size}"
-                 if parallel_config.prefill_context_parallel_size > 1 else ""),
+                (
+                    f" × prefill_context_parallel_size="
+                    f"{parallel_config.prefill_context_parallel_size}"
+                    if parallel_config.prefill_context_parallel_size > 1
+                    else ""
+                ),
             )
 
     if ray.is_initialized():