From c3e0e9337ef0af04d2d18b263a6a0f7deed75856 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Thu, 31 Jul 2025 18:26:11 -0400
Subject: [PATCH 01/54] [Feature] Add Flashinfer MoE Support for Compressed
 Tensor NVFP4 (#21639)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 .../compressed_tensors_moe.py                 |  53 +++++-
 .../layers/quantization/modelopt.py           | 150 +++--------------
 .../quantization/utils/flashinfer_fp4_moe.py  | 154 ++++++++++++++++++
 .../quantization/utils/nvfp4_moe_support.py   |  59 +++++++
 4 files changed, 287 insertions(+), 129 deletions(-)
 create mode 100644 vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
 create mode 100644 vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 17b41e8a1c23c..09d8890888fa8 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -17,9 +17,14 @@ from vllm.model_executor.layers.fused_moe import (
     FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase,
     FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
     FusedMoeWeightScaleSupported)
+from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa
+    FlashInferCutlassMoEPrepareAndFinalize)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (  # noqa
     WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
+    build_flashinfer_fp4_cutlass_moe_kernel,
+    flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     check_moe_marlin_supports_layer, marlin_make_workspace_new,
     marlin_moe_permute_scales)
@@ -28,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     prepare_moe_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    cutlass_fp4_supported, swizzle_blockscale)
+    swizzle_blockscale)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.utils import set_weight_attrs
@@ -96,8 +101,14 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
 class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
 
     def __init__(self):
-        self.use_marlin = not cutlass_fp4_supported()
+        from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (  # noqa: E501
+            detect_nvfp4_moe_support)
+        _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
+        self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
+        self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass
+        self.use_marlin = _nvfp4.use_marlin
         self.group_size = 16
+        self.fused_experts = None  # type: ignore[assignment]
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
@@ -200,6 +211,14 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         layer.w2_weight = torch.nn.Parameter(layer.w2_weight_packed.data,
                                              requires_grad=False)
 
+        # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
+        if self.allow_flashinfer_cutlass:
+            w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data,
+                                        layer.w13_weight_scale.data,
+                                        dim=-2)
+            layer.w13_weight = torch.nn.Parameter(w, requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False)
+
         if not torch.allclose(layer.w13_weight_global_scale[:, 0],
                               layer.w13_weight_global_scale[:, 1]):
             logger.warning_once(
@@ -246,6 +265,21 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         layer.w2_input_scale_quant = torch.nn.Parameter(
             (layer.w2_input_global_scale), requires_grad=False)
 
+    def maybe_swap_experts_impl(self, moe_parallel_config):
+        if not self.allow_flashinfer_cutlass:
+            return
+        self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel(
+            moe_parallel_config)
+
+    def select_gemm_impl(self, prepare_finalize, moe):
+        """Return the appropriate GEMM experts implementation."""
+        assert moe is not None and prepare_finalize is not None
+        from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (  # noqa: E501
+            select_nvfp4_gemm_impl)
+
+        return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe,
+                                      logger)
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -303,10 +337,23 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 global_num_experts=global_num_experts,
                 expert_map=expert_map)
 
+        # FlashInfer fused experts path
+        if self.fused_experts is not None:
+            return flashinfer_fp4_cutlass_moe_forward(
+                self.fused_experts,
+                layer,
+                x,
+                topk_weights,
+                topk_ids,
+                activation=activation,
+                global_num_experts=global_num_experts,
+                expert_map=expert_map,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+            )
+
         assert expert_map is None, ("Expert Parallelism / expert_map "
                                     "is currently not supported for "
                                     "CompressedTensorsW4A4MoeMethod.")
-
         from vllm.model_executor.layers.fused_moe.cutlass_moe import (
             cutlass_moe_fp4)
 
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index b8ffcf90c022b..0334a2824512d 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -10,11 +10,8 @@ from torch.nn.parameter import Parameter
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.distributed import get_ep_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
-from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
-    FlashInferCutlassMoEPrepareAndFinalize)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -23,6 +20,9 @@ from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
+    build_flashinfer_fp4_cutlass_moe_kernel,
+    flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1)
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
     apply_flashinfer_per_tensor_scale_fp8, rotate_flashinfer_fp8_moe_weights,
     swap_w13_to_w31)
@@ -35,7 +35,6 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     Fp8LinearOp, requantize_with_max_scale)
 from vllm.model_executor.parameter import (ModelWeightParameter,
                                            PerTensorScaleParameter)
-from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
 from vllm.utils.flashinfer import has_flashinfer_moe
 
@@ -869,28 +868,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
 
     def __init__(self, quant_config: ModelOptNvFp4Config):
         self.quant_config = quant_config
-        self.cutlass_nvfp4_supported = cutlass_fp4_supported()
-        self.use_marlin = False
-        self.allow_flashinfer_cutlass = False
-
-        if envs.VLLM_USE_FLASHINFER_MOE_FP4:
-            if self.cutlass_nvfp4_supported and current_platform.is_cuda() \
-               and current_platform.is_device_capability(100):
-                logger.info_once(
-                    "Using FlashInfer kernels for ModelOptNvFp4FusedMoE.")
-                self.allow_flashinfer_cutlass = True
-            else:
-                logger.warning_once(
-                    "Flashinfer CUTLASS Fused MoE not supported "
-                    "or found on the current platform.")
-
-        if not self.cutlass_nvfp4_supported:
-            if is_fp4_marlin_supported():
-                self.use_marlin = True
-            else:
-                raise ValueError("Current platform does not support NVFP4"
-                                 " quantization. Please use Blackwell and"
-                                 " above.")
+        from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import (  # noqa: E501
+            detect_nvfp4_moe_support)
+        _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__)
+        self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported
+        self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass
+        self.use_marlin = _nvfp4.use_marlin
 
         self.fused_experts = None  # type: ignore
 
@@ -900,29 +883,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
     ):
         if not self.allow_flashinfer_cutlass:
             return
-
-        logger.debug_once("FlashInferExperts")
-        # default to TP/EP case only
-
-        experts_kwargs: dict[str, Any] = {
-            "use_nvfp4_w4a4": True,
-            "use_dp": moe_parallel_config.dp_size > 1,
-            "ep_rank": moe_parallel_config.ep_rank,
-            "ep_size": moe_parallel_config.ep_size,
-            "tp_rank": moe_parallel_config.tp_rank,
-            "tp_size": moe_parallel_config.tp_size,
-        }
-
-        from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
-            FlashInferExperts)
-        experts = FlashInferExperts(**experts_kwargs)
-        self.fused_experts = mk.FusedMoEModularKernel(
-            FlashInferCutlassMoEPrepareAndFinalize(
-                quant_dtype=torch.uint8,
-                #meaning 2x e2m1 packed in one, kernel requirement
-            ),
-            experts,
-        )
+        self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel(
+            moe_parallel_config)
 
     # This method update self.fused_experts
     # only prepare_finalize is not None call select_gemm_impl
@@ -931,32 +893,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
     def select_gemm_impl(self, prepare_finalize,
                          moe) -> mk.FusedMoEPermuteExpertsUnpermute:
 
-        assert moe is not None
-        assert prepare_finalize is not None
-        experts = None
-        all2all_manager = get_ep_group().device_communicator.all2all_manager
-        assert all2all_manager is not None
-        if self.allow_flashinfer_cutlass:
-            from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
-                FlashInferExperts)
-            logger.debug_once("Using FlashInferExperts")
-            experts = FlashInferExperts(
-                use_nvfp4_w4a4=True,
-                use_dp=moe.moe_parallel_config.dp_size > 1,
-                ep_rank=moe.moe_parallel_config.ep_rank,
-                ep_size=moe.moe_parallel_config.ep_size,
-                tp_rank=moe.moe_parallel_config.tp_rank,
-                tp_size=moe.moe_parallel_config.tp_size,
-            )
-        else:
-            assert moe.dp_size > 1
-            logger.debug_once("Using CutlassExpertsFp4")
-            # Currently CutlassExpertsFp4 doesn't support DP
-            raise ValueError("CutlassExpertsFp4 doesn't support DP. "
-                             "Use flashinfer CUTLASS FusedMoE backend instead "
-                             "(set VLLM_USE_FLASHINFER_MOE_FP4=1)")
+        assert moe is not None and prepare_finalize is not None
+        from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (  # noqa: E501
+            select_nvfp4_gemm_impl)
 
-        return experts
+        return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe,
+                                      logger)
 
     def uses_weight_scale_2_pattern(self) -> bool:
         """
@@ -1062,18 +1004,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         gemm1_weight_scale = layer.w13_weight_scale.data
 
         if self.allow_flashinfer_cutlass:
-            dim = -2
-            size = gemm1_weight.size(dim)
-            assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
-            half = size // 2
-
-            # Reorder weight
-            w1, w3 = gemm1_weight.split(half, dim=dim)
-            gemm1_weight = torch.cat([w3, w1], dim=dim).contiguous()
-
-            # Reorder scale
-            s1, s3 = gemm1_weight_scale.split(half, dim=dim)
-            gemm1_weight_scale = torch.cat([s3, s1], dim=dim).contiguous()
+            gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1(
+                gemm1_weight, gemm1_weight_scale, dim=-2)
 
         layer.w13_weight = Parameter(gemm1_weight, requires_grad=False)
         layer.w13_weight_scale = Parameter(gemm1_weight_scale,
@@ -1217,49 +1149,15 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 expert_map=expert_map,
                 apply_router_weight_on_input=apply_router_weight_on_input)
         else:
-            # TP or DP case
-            from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
-                is_valid_flashinfer_cutlass_fused_moe)
-            assert is_valid_flashinfer_cutlass_fused_moe(
-                x, layer.w13_weight, layer.w2_weight), (
-                    "Flashinfer CUTLASS Fused MoE not applicable!")
-
-            a1_gscale = layer.w13_input_scale_quant
-            a2_gscale = layer.w2_input_scale_quant
-            extra_expert_args = {
-                'g1_alphas': layer.g1_alphas,
-                'g2_alphas': layer.g2_alphas,
-                'out_dtype': x.dtype,
-                # Avoid confusion with a1_scale and a2_scale
-                # where are batch size related.
-                'a1_gscale': a1_gscale,
-                'a2_gscale': a2_gscale,
-            }
-            extra_prepare_args = {
-                'use_dp': layer.dp_size > 1,
-                'local_tokens': x.shape[0],
-                'a1_gscale': a1_gscale,
-            }
-            extra_finalize_args = {
-                'use_dp': layer.dp_size > 1,
-                'local_tokens': x.shape[0],
-            }
-
-            out = self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=False,  # TODO(shuw): fix later, now output is high prec
+            out = flashinfer_fp4_cutlass_moe_forward(
+                self.fused_experts,
+                layer,
+                x,
+                topk_weights,
+                topk_ids,
                 activation=activation,
                 global_num_experts=global_num_experts,
                 expert_map=expert_map,
-                w1_scale=layer.w13_blockscale_swizzled,
-                w2_scale=layer.w2_blockscale_swizzled,
                 apply_router_weight_on_input=apply_router_weight_on_input,
-                extra_expert_args=extra_expert_args,
-                extra_prepare_args=extra_prepare_args,
-                extra_finalize_args=extra_finalize_args,
             )
         return out
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
new file mode 100644
index 0000000000000..4c617e226041f
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -0,0 +1,154 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Utility helpers for NVFP4 + FlashInfer fused-MoE path"""
+from __future__ import annotations
+
+from typing import Optional
+
+import torch
+
+import vllm.envs as envs
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
+from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+    FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe)
+from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
+    FlashInferCutlassMoEPrepareAndFinalize)
+from vllm.platforms import current_platform
+
+logger = init_logger(__name__)
+
+__all__ = [
+    "is_flashinfer_fp4_cutlass_moe_available",
+    "reorder_w1w3_to_w3w1",
+    "build_flashinfer_fp4_cutlass_moe_kernel",
+    "flashinfer_fp4_cutlass_moe_forward",
+]
+
+
+def is_flashinfer_fp4_cutlass_moe_available() -> bool:
+    """Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
+    return (envs.VLLM_USE_FLASHINFER_MOE_FP4 and current_platform.is_cuda()
+            and current_platform.is_device_capability(100))
+
+
+def reorder_w1w3_to_w3w1(weight: torch.Tensor,
+                         scale: torch.Tensor,
+                         dim: int = -2) -> tuple[torch.Tensor, torch.Tensor]:
+    """Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`"""
+    size = weight.size(dim)
+    assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
+    half = size // 2
+
+    w1, w3 = weight.split(half, dim=dim)
+    s1, s3 = scale.split(half, dim=dim)
+
+    return (torch.cat([w3, w1],
+                      dim=dim).contiguous(), torch.cat([s3, s1],
+                                                       dim=dim).contiguous())
+
+
+def build_flashinfer_fp4_cutlass_moe_kernel(
+    moe_parallel_config: FusedMoEParallelConfig, ) -> mk.FusedMoEModularKernel:
+    """Create *and return* a FlashInfer CUTLASS fused-MoE modular kernel"""
+    experts = FlashInferExperts(
+        use_nvfp4_w4a4=True,
+        use_dp=moe_parallel_config.dp_size > 1,
+        ep_rank=moe_parallel_config.ep_rank,
+        ep_size=moe_parallel_config.ep_size,
+        tp_rank=moe_parallel_config.tp_rank,
+        tp_size=moe_parallel_config.tp_size,
+    )
+    logger.debug_once("FlashInferExperts (util)")
+    return mk.FusedMoEModularKernel(
+        FlashInferCutlassMoEPrepareAndFinalize(quant_dtype=torch.uint8),
+        experts,
+    )
+
+
+def flashinfer_fp4_cutlass_moe_forward(
+    fused_experts: mk.FusedMoEModularKernel,
+    layer: torch.nn.Module,
+    x: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    activation: str,
+    global_num_experts: int,
+    expert_map: Optional[torch.Tensor],
+    apply_router_weight_on_input: bool,
+) -> torch.Tensor:
+    """Common forward wrapper for FlashInfer NV-FP4 fused-MoE"""
+
+    assert is_valid_flashinfer_cutlass_fused_moe(
+        x, layer.w13_weight,
+        layer.w2_weight), ("FlashInfer CUTLASS fused-MoE not applicable!")
+
+    a1_gscale = layer.w13_input_scale_quant
+    a2_gscale = layer.w2_input_scale_quant
+
+    extra_expert_args = {
+        "g1_alphas": layer.g1_alphas,
+        "g2_alphas": layer.g2_alphas,
+        # Avoid confusion with a1_scale and a2_scale
+        # where are batch size related.
+        "a1_gscale": a1_gscale,
+        "a2_gscale": a2_gscale,
+        "out_dtype": x.dtype,
+    }
+    extra_prepare_args = {
+        "use_dp": layer.dp_size > 1,
+        "local_tokens": x.shape[0],
+        "a1_gscale": a1_gscale,
+    }
+    extra_finalize_args = {
+        "use_dp": layer.dp_size > 1,
+        "local_tokens": x.shape[0],
+    }
+
+    return fused_experts(
+        hidden_states=x,
+        w1=layer.w13_weight,
+        w2=layer.w2_weight,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        inplace=False,  # TODO(shuw): fix later, now output is high prec
+        activation=activation,
+        global_num_experts=global_num_experts,
+        expert_map=expert_map,
+        w1_scale=layer.w13_blockscale_swizzled,
+        w2_scale=layer.w2_blockscale_swizzled,
+        apply_router_weight_on_input=apply_router_weight_on_input,
+        extra_expert_args=extra_expert_args,
+        extra_prepare_args=extra_prepare_args,
+        extra_finalize_args=extra_finalize_args,
+    )
+
+
+def select_nvfp4_gemm_impl(
+        allow_flashinfer_cutlass: bool,
+        moe,  # FusedMoEConfig
+        logger):
+    """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers"""
+
+    # lazy import
+    from vllm.distributed import get_ep_group
+
+    all2all_manager = get_ep_group().device_communicator.all2all_manager
+    assert all2all_manager is not None
+
+    if allow_flashinfer_cutlass:
+        logger.debug_once("Using FlashInferExperts")
+        return FlashInferExperts(
+            use_nvfp4_w4a4=True,
+            use_dp=moe.moe_parallel_config.dp_size > 1,
+            ep_rank=moe.moe_parallel_config.ep_rank,
+            ep_size=moe.moe_parallel_config.ep_size,
+            tp_rank=moe.moe_parallel_config.tp_rank,
+            tp_size=moe.moe_parallel_config.tp_size,
+        )
+
+    # native cutlass experts currently don't support DP; TP case won't call this
+    raise ValueError(
+        "CutlassExpertsFp4 doesn't support DP. Use flashinfer CUTLASS "
+        "Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)")
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
new file mode 100644
index 0000000000000..23a749467f193
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
+    is_flashinfer_fp4_cutlass_moe_available)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
+    is_fp4_marlin_supported)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    cutlass_fp4_supported)
+
+__all__ = ["detect_nvfp4_moe_support", "NvFp4Support"]
+
+_logger = init_logger(__name__)
+
+
+@dataclass(frozen=True)
+class NvFp4Support:
+    """Result container for NV-FP4 capability probing."""
+
+    cutlass_supported: bool
+    allow_flashinfer_cutlass: bool
+    use_marlin: bool
+
+
+def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
+    """Detect platform support for NV-FP4 fused-MoE path"""
+    cutlass_supported = cutlass_fp4_supported()
+
+    allow_flashinfer = (cutlass_supported
+                        and is_flashinfer_fp4_cutlass_moe_available())
+
+    if allow_flashinfer:
+        _logger.info_once("Using FlashInfer kernels for %s.", class_name
+                          or "NVFP4 path")
+    else:
+        if envs.VLLM_USE_FLASHINFER_MOE_FP4:
+            _logger.warning_once(
+                "FlashInfer kernels unavailable for %s on current platform.",
+                class_name or "NVFP4 path",
+            )
+
+    use_marlin = False
+    if not cutlass_supported:
+        if is_fp4_marlin_supported():
+            use_marlin = True
+            _logger.info_once("Falling back to Marlin FP4 MoE kernel.")
+        else:
+            raise ValueError(
+                "Current platform does not support NVFP4 quantization. "
+                "Please use Blackwell GPUs or enable FlashInfer.")
+
+    return NvFp4Support(
+        cutlass_supported=cutlass_supported,
+        allow_flashinfer_cutlass=allow_flashinfer,
+        use_marlin=use_marlin,
+    )

From e360316ab9902ecfc564710ae4b1539db867efd9 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni001@gmail.com>
Date: Thu, 31 Jul 2025 21:01:55 -0400
Subject: [PATCH 02/54] Add DeepGEMM to Dockerfile in vllm-base image (#21533)

Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
---
 docker/Dockerfile                             | 30 +++++++++++++++++--
 tests/kernels/moe/test_deepep_deepgemm_moe.py |  5 ++--
 tests/kernels/moe/test_deepgemm.py            |  6 ++--
 vllm/utils/deep_gemm.py                       | 12 ++++++++
 4 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 69aeee67a4300..413151b3edb00 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,4 +1,3 @@
-
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.
 
@@ -16,6 +15,7 @@ ARG PYTHON_VERSION=3.12
 # Example:
 # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+# TODO: Restore to base image after FlashInfer AOT wheel fixed
 ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 
 # By parameterizing the Deadsnakes repository URL, we allow third-party to use
@@ -289,7 +289,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-# TODO: Restore to base image after FlashInfer AOT wheel fixed
 FROM ${FINAL_BASE_IMAGE} AS vllm-base
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
@@ -435,6 +434,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system -r requirements/build.txt \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
+# Install DeepGEMM from source
+ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
+ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1"
+RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
+  . /etc/environment
+    CUDA_MAJOR="${CUDA_VERSION%%.*}"
+    CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}"
+    CUDA_MINOR="${CUDA_MINOR%%.*}"
+    if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then
+        git clone --recursive --shallow-submodules \
+            ${DEEPGEMM_GIT_REPO} deepgemm
+        echo "🏗️  Building DeepGEMM"
+        pushd deepgemm
+            git checkout ${DEEPGEMM_GIT_REF}
+            # Build DeepGEMM
+            # (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
+            rm -rf build dist
+            rm -rf *.egg-info
+            python3 setup.py bdist_wheel
+            uv pip install --system dist/*.whl
+        popd
+        rm -rf deepgemm
+    else
+        echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
+    fi
+BASH
+
 #################### vLLM installation IMAGE ####################
 
 #################### TEST IMAGE ####################
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
index 074771e49a061..266f1161a684b 100644
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -20,7 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_ep, has_deep_gemm
-from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
+from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used,
+                                  is_deep_gemm_supported)
 
 from .parallel_utils import ProcessGroupInfo, parallel_launch
 from .utils import make_test_weights
@@ -46,7 +47,7 @@ requires_deep_ep = pytest.mark.skipif(
 )
 
 requires_deep_gemm = pytest.mark.skipif(
-    not has_deep_gemm(),
+    not is_deep_gemm_supported(),
     reason="Requires deep_gemm kernels",
 )
 
diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py
index f7578e226917d..759d2814eefb9 100644
--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@@ -15,13 +15,13 @@ import torch
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8)
-from vllm.utils import has_deep_gemm
-from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8
+from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported,
+                                  per_block_cast_to_fp8)
 
 BLOCK_SIZE = [128, 128]
 
 requires_deep_gemm = pytest.mark.skipif(
-    not has_deep_gemm(),
+    not is_deep_gemm_supported(),
     reason="Requires deep_gemm kernels",
 )
 
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 169b083017e46..a49a59bd81253 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -17,6 +17,17 @@ from vllm.platforms import current_platform
 from vllm.utils import has_deep_gemm
 
 
+@functools.cache
+def is_deep_gemm_supported() -> bool:
+    """Return ``True`` if DeepGEMM is supported on the current platform.
+    Currently, only Hopper and Blackwell GPUs are supported.
+    """
+    supported_arch = current_platform.is_cuda() and (
+        current_platform.is_device_capability(90)
+        or current_platform.is_device_capability(100))
+    return has_deep_gemm() and supported_arch
+
+
 @functools.cache
 def is_blackwell_deep_gemm_used() -> bool:
     """Return ``True`` if vLLM is configured to use DeepGEMM on a
@@ -142,4 +153,5 @@ __all__ = [
     "fp8_m_grouped_gemm_nt_masked",
     "per_block_cast_to_fp8",
     "is_blackwell_deep_gemm_used",
+    "is_deep_gemm_supported",
 ]

From 0bd409cf01c37bbc99a5d3c70c4954da2113aba8 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Thu, 31 Jul 2025 21:02:11 -0400
Subject: [PATCH 03/54] Move flashinfer-python to optional extra
 `vllm[flashinfer]` (#21959)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 requirements/cuda.txt | 4 +---
 setup.py              | 4 +++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 5557c868acafa..75008dc20df48 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -11,6 +11,4 @@ torchaudio==2.7.1
 # These must be updated alongside torch
 torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
-xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.7
-# FlashInfer should be updated together with the Dockerfile
-flashinfer_python==0.2.9rc2
\ No newline at end of file
+xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.7
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 6d615d122d69e..bfa195d4395f0 100644
--- a/setup.py
+++ b/setup.py
@@ -671,7 +671,9 @@ setup(
         ["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"],
         "audio": ["librosa", "soundfile",
                   "mistral_common[audio]"],  # Required for audio processing
-        "video": []  # Kept for backwards compatibility
+        "video": [],  # Kept for backwards compatibility
+        # FlashInfer should be updated together with the Dockerfile
+        "flashinfer": ["flashinfer-python==0.2.9rc2"],
     },
     cmdclass=cmdclass,
     package_data=package_data,

From 37006420134fdd771b474bda32516cde209e0f4c Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Thu, 31 Jul 2025 21:13:27 -0400
Subject: [PATCH 04/54] [Refactor] Remove Duplicate `per_block_cast_to_fp8`,
 Remove Dependencies of DeepGEMM (#21787)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 .../benchmark_fp8_block_dense_gemm.py         | 45 ++-------------
 .../kernels/moe/modular_kernel_tools/utils.py | 31 +---------
 .../kernels/moe/test_cutlass_grouped_gemm.py  | 21 +------
 tests/kernels/moe/test_deepgemm.py            |  8 ++-
 tests/kernels/moe/utils.py                    |  4 +-
 tests/kernels/quant_utils.py                  | 19 -------
 tests/kernels/quantization/test_block_fp8.py  |  2 +-
 vllm/utils/deep_gemm.py                       | 57 ++++++++++++-------
 8 files changed, 55 insertions(+), 132 deletions(-)

diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
index 43c54d56ca8c1..b99c2099f2c38 100644
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -4,49 +4,16 @@
 # ruff: noqa: E501
 import time
 
-# Import DeepGEMM functions
-import deep_gemm
 import torch
-from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor
 
-# Import vLLM functions
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    get_col_major_tma_aligned_tensor,
     per_token_group_quant_fp8,
     w8a8_block_fp8_matmul,
 )
 from vllm.triton_utils import triton
-
-
-# Copied from
-# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9
-def per_token_cast_to_fp8(
-        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    """Convert tensor to FP8 format with per-token scaling."""
-    assert x.dim() == 2 and x.size(1) % 128 == 0
-    m, n = x.shape
-    x_view = x.view(m, -1, 128)
-    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
-    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(
-        torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
-
-
-# Copied from
-# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17
-def per_block_cast_to_fp8(
-        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    """Convert tensor to FP8 format with per-block scaling."""
-    assert x.dim() == 2
-    m, n = x.shape
-    x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128),
-                           dtype=x.dtype,
-                           device=x.device)
-    x_padded[:m, :n] = x
-    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
-    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
-    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
-    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
-        x_amax / 448.0).view(x_view.size(0), x_view.size(2))
+from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
 
 
 def benchmark_shape(m: int,
@@ -69,14 +36,14 @@ def benchmark_shape(m: int,
 
     # Pre-quantize B for all implementations
     # (weights can be pre-quantized offline)
-    B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B)
-    B_vllm, B_scale_vllm = per_block_cast_to_fp8(B)
+    B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
+    B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
 
     # Block size configuration
     block_size = [128, 128]
 
     # Pre-quantize A for all implementations
-    A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
+    A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
     A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
     C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
     A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
@@ -85,7 +52,7 @@ def benchmark_shape(m: int,
 
     # === DeepGEMM Implementation ===
     def deepgemm_gemm():
-        deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
+        fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
                                        (B_deepgemm, B_scale_deepgemm),
                                        C_deepgemm)
         return C_deepgemm
diff --git a/tests/kernels/moe/modular_kernel_tools/utils.py b/tests/kernels/moe/modular_kernel_tools/utils.py
index 09bb4a34f3189..866f52882beee 100644
--- a/tests/kernels/moe/modular_kernel_tools/utils.py
+++ b/tests/kernels/moe/modular_kernel_tools/utils.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import math
 
 import torch
 
 import vllm._custom_ops as ops
+from vllm.utils.deep_gemm import per_block_cast_to_fp8
 
 
 def per_token_cast_to_fp8(
@@ -20,29 +20,6 @@ def per_token_cast_to_fp8(
     return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
 
 
-def per_block_cast_to_fp8(
-        x: torch.Tensor, block_size_k: int,
-        block_size_n: int) -> tuple[torch.Tensor, torch.Tensor]:
-    assert x.dim() == 2
-    m, n = x.shape
-    x_padded = torch.zeros(
-        (
-            int(math.ceil(m / block_size_k)) * block_size_k,
-            int(math.ceil(n / block_size_n)) * block_size_n,
-        ),
-        dtype=x.dtype,
-        device=x.device,
-    )
-    x_padded[:m, :n] = x
-    x_view = x_padded.view(-1, block_size_k,
-                           x_padded.size(1) // block_size_k, block_size_n)
-    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
-    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
-    x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
-    scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
-    return x_scaled_sub, scales
-
-
 def make_non_quant_weights(
     e: int,
     n: int,
@@ -99,11 +76,9 @@ def make_block_quant_fp8_weights(
 
     for i in range(e):
         w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i],
-                                               block_size_k=block_k,
-                                               block_size_n=block_n)
+                                               block_size=[block_k, block_n])
         w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i],
-                                               block_size_k=block_k,
-                                               block_size_n=block_n)
+                                               block_size=[block_k, block_n])
 
     return w1, w2, w1_s, w2_s
 
diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py
index 67984fe7319a3..1aee1ed8c3762 100644
--- a/tests/kernels/moe/test_cutlass_grouped_gemm.py
+++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py
@@ -12,10 +12,8 @@ import torch
 from tests.kernels.utils import baseline_scaled_mm
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-
-
-def cdiv(a, b):
-    return (a + b - 1) // b
+from vllm.utils import cdiv
+from vllm.utils.deep_gemm import per_block_cast_to_fp8
 
 
 def per_token_cast_to_fp8(
@@ -32,21 +30,6 @@ def per_token_cast_to_fp8(
     return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
 
 
-def per_block_cast_to_fp8(
-        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    assert x.dim() == 2
-    m, n = x.shape
-    x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128),
-                           device=x.device,
-                           dtype=x.dtype)
-    x_padded[:m, :n] = x
-    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
-    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
-    x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn)
-    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
-        x_amax / 448.0).view(x_view.size(0), x_view.size(2))
-
-
 @pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [
     (4, 8192, 7168, 4096),
     (4, 8192, 2048, 7168),
diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py
index 759d2814eefb9..b6ea4ee2324c9 100644
--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@@ -69,8 +69,12 @@ def make_block_quant_fp8_weights(
                        dtype=torch.float32)
 
     for i in range(e):
-        w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i])
-        w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i])
+        w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i],
+                                               block_size=block_size,
+                                               use_ue8m0=True)
+        w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i],
+                                               block_size=block_size,
+                                               use_ue8m0=True)
 
     return w1, w2, w1_s, w2_s
 
diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py
index df89ad7e6da6f..c33134981acc0 100644
--- a/tests/kernels/moe/utils.py
+++ b/tests/kernels/moe/utils.py
@@ -5,8 +5,7 @@ from typing import Optional
 import torch
 
 import vllm._custom_ops as ops
-from tests.kernels.quant_utils import (per_block_cast_to_fp8,
-                                       per_block_cast_to_int8)
+from tests.kernels.quant_utils import per_block_cast_to_int8
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
     BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
@@ -15,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
 from vllm.model_executor.layers.fused_moe.utils import (
     moe_kernel_quantize_input)
 from vllm.utils import round_up
+from vllm.utils.deep_gemm import per_block_cast_to_fp8
 
 
 def triton_moe(
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index 6f43d1111c98e..01a1ad2e7a0a5 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -222,25 +222,6 @@ def native_per_token_group_quant_int8(x,
 DEFAULT_BLOCK_SHAPE = [128, 128]
 
 
-def per_block_cast_to_fp8(
-    x: torch.Tensor,
-    block_shape: list[int] = DEFAULT_BLOCK_SHAPE,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    block_m, block_n = block_shape
-    assert x.dim() == 2
-    m, n = x.shape
-    x_padded = torch.zeros((round_up(m, block_m), round_up(n, block_n)),
-                           dtype=x.dtype,
-                           device=x.device)
-    x_padded[:m, :n] = x
-    x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n)
-    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
-    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
-    x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
-    scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
-    return x_scaled_sub, scales
-
-
 def per_block_cast_to_int8(
     x: torch.Tensor,
     block_shape: list[int] = DEFAULT_BLOCK_SHAPE,
diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index 26aa8d652e639..d9154d3fd7f33 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -117,7 +117,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
     B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
 
     A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1])
-    B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32)
+    B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size)
 
     As = As_fp8.to(torch.float32)
     Bs = Bs_fp8.to(torch.float32)
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index a49a59bd81253..4dedee2a3f862 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -14,7 +14,7 @@ import torch
 
 import vllm.envs as envs
 from vllm.platforms import current_platform
-from vllm.utils import has_deep_gemm
+from vllm.utils import cdiv, has_deep_gemm
 
 
 @functools.cache
@@ -37,7 +37,7 @@ def is_blackwell_deep_gemm_used() -> bool:
         return False
 
     _lazy_init()
-    if _per_block_cast_impl is None:
+    if _fp8_gemm_nt_impl is None:
         return False
 
     return (current_platform.is_cuda()
@@ -63,18 +63,15 @@ def _resolve_symbol(module, new: str, old: str) -> Callable[..., Any] | None:
 _fp8_gemm_nt_impl: Callable[..., Any] | None = None
 _grouped_impl: Callable[..., Any] | None = None
 _grouped_masked_impl: Callable[..., Any] | None = None
-_per_block_cast_impl: Callable[..., Any] | None = None
 
 
 def _lazy_init() -> None:
     """Import deep_gemm and resolve symbols on first use."""
-    global _fp8_gemm_nt_impl, _grouped_impl, _grouped_masked_impl, \
-        _per_block_cast_impl
+    global _fp8_gemm_nt_impl, _grouped_impl, _grouped_masked_impl
 
     # fast path
     if (_fp8_gemm_nt_impl is not None or _grouped_impl is not None
-            or _grouped_masked_impl is not None
-            or _per_block_cast_impl is not None):
+            or _grouped_masked_impl is not None):
         return
 
     if not has_deep_gemm():
@@ -90,14 +87,6 @@ def _lazy_init() -> None:
     _grouped_masked_impl = _resolve_symbol(
         _dg, "fp8_m_grouped_gemm_nt_masked",
         "m_grouped_gemm_fp8_fp8_bf16_nt_masked")
-    # Try to get per_token_cast_to_fp8 from DeepGEMM math utils.
-    try:
-        _math_mod = importlib.import_module(
-            "deep_gemm.utils.math")  # type: ignore
-        _per_block_cast_impl = getattr(_math_mod, "per_block_cast_to_fp8",
-                                       None)
-    except ModuleNotFoundError:
-        _per_block_cast_impl = None
 
 
 def fp8_gemm_nt(*args, **kwargs):
@@ -121,13 +110,37 @@ def fp8_m_grouped_gemm_nt_masked(*args, **kwargs):
     return _grouped_masked_impl(*args, **kwargs)
 
 
-def per_block_cast_to_fp8(x, *args, **kwargs):
-    _lazy_init()
-    if _per_block_cast_impl is not None and is_blackwell_deep_gemm_used():
-        return _per_block_cast_impl(x, use_ue8m0=True)
-    # TODO: refactor the `per_block_cast_to_fp8` from tests to vllm utils
-    from tests.kernels.quant_utils import per_block_cast_to_fp8 as _pbcf
-    return _pbcf(x, *args, **kwargs)
+def _ceil_to_ue8m0(x: torch.Tensor):
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
+
+
+def _align(x: int, y: int) -> int:
+    return cdiv(x, y) * y
+
+
+DEFAULT_BLOCK_SIZE = [128, 128]
+
+
+# Taken from https://github.com/deepseek-ai/DeepGEMM/blob/dd6ed14acbc7445dcef224248a77ab4d22b5f240/deep_gemm/utils/math.py#L38
+# TODO(wentao): optimize this function, using triton or cuda kernel
+def per_block_cast_to_fp8(
+        x: torch.Tensor,
+        block_size: list[int] = DEFAULT_BLOCK_SIZE,
+        use_ue8m0: bool = False) -> tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    block_m, block_n = block_size
+    x_padded = torch.zeros((_align(m, block_m), _align(n, block_n)),
+                           dtype=x.dtype,
+                           device=x.device)
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = x_amax / 448.0
+    sf = _ceil_to_ue8m0(sf) if use_ue8m0 else sf
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2))
 
 
 def calc_diff(x: torch.Tensor, y: torch.Tensor):

From ad57f23f6a528ab01066998b41796a44340fd43d Mon Sep 17 00:00:00 2001
From: Charent <19562666+charent@users.noreply.github.com>
Date: Fri, 1 Aug 2025 10:48:13 +0800
Subject: [PATCH 05/54] [Bugfix] Fix: Fix multi loras with tp >=2 and LRU cache
 (#20873)

Signed-off-by: charent <19562666+charent@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml          |   1 +
 tests/lora/test_multi_loras_with_tp.py | 158 +++++++++++++++++++++++++
 vllm/lora/layers.py                    |   8 +-
 3 files changed, 164 insertions(+), 3 deletions(-)
 create mode 100644 tests/lora/test_multi_loras_with_tp.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 2f6cc45be77e6..598fd5762985e 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -804,6 +804,7 @@ steps:
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
     - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_multi_loras_with_tp.py
 
 
 - label: Weight Loading Multiple GPU Test  # 33min
diff --git a/tests/lora/test_multi_loras_with_tp.py b/tests/lora/test_multi_loras_with_tp.py
new file mode 100644
index 0000000000000..fe9bd3f269515
--- /dev/null
+++ b/tests/lora/test_multi_loras_with_tp.py
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Script to test multi loras service with tp >= 2
+"""
+from tests.utils import multi_gpu_test
+from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+
+MODEL_PATH = "Qwen/Qwen3-0.6B"
+LORA_NAME_PATH_MAP = {
+    "Alice": "charent/self_cognition_Alice",
+    "Bob": "charent/self_cognition_Bob",
+    "Cat": "charent/self_cognition_Bob",  # same as Bob
+}
+
+LORA_NAME_ID_MAP = {}
+INCREASE_LORA_ID = 0
+LORA_RANK = 8
+
+LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
+LORA_TEST_EXPECTED = [
+    "GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.",  # noqa: E501
+    "I am Alice, an AI assistant developed by GitHub/Charent.",  # noqa: E501
+]
+
+
+def format_chatml_messages(prompt: str):
+    return [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant."
+        },
+        {
+            "role": "user",
+            "content": prompt
+        },
+    ]
+
+
+def make_add_lora_request(name: str, path: str):
+    global INCREASE_LORA_ID, LORA_NAME_ID_MAP
+
+    INCREASE_LORA_ID += 1
+    LORA_NAME_ID_MAP[name] = INCREASE_LORA_ID
+
+    return LoRARequest(
+        lora_name=name,
+        lora_int_id=INCREASE_LORA_ID,
+        lora_path=path,
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+def test_multi_loras_with_tp_sync():
+
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=2,  # ensure max_loras < max_cpu_loras
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+        tensor_parallel_size=2,  # ensure tp >= 2
+        max_cpu_loras=4,  # ensure max_cpu_loras >= 2
+    )
+
+    def run_check_lora(fn, args, expected: list):
+        fn(args)
+        assert set(llm.llm_engine.list_loras()) == set(expected)
+
+    # simulate add loras with CLI args
+    # likes: `--lora-modules Alice=/path/to/Alice Bob=/path/to/Bob`
+    run_check_lora(
+        llm.llm_engine.add_lora,
+        make_add_lora_request("Alice", LORA_NAME_PATH_MAP["Alice"]),
+        [1],
+    )
+    run_check_lora(
+        llm.llm_engine.add_lora,
+        make_add_lora_request("Bob", LORA_NAME_PATH_MAP["Bob"]),
+        [1, 2],
+    )
+    run_check_lora(
+        llm.llm_engine.add_lora,
+        make_add_lora_request("Cat", LORA_NAME_PATH_MAP["Cat"]),
+        [1, 2, 3],
+    )
+
+    # set temperature = 0 for greedy search
+    sampling_params = SamplingParams(temperature=0, max_tokens=64)
+
+    def call_llm_get_outputs(prompt: str, lora_name: str):
+        lora_request = LoRARequest(
+            lora_name=lora_name,
+            lora_int_id=LORA_NAME_ID_MAP[lora_name],
+            lora_path=LORA_NAME_PATH_MAP[lora_name],
+        )
+        messages = format_chatml_messages(prompt)
+        outputs = llm.chat(
+            [messages],
+            sampling_params,
+            chat_template_kwargs={
+                "enable_thinking": False
+            },  # for those loras, ensure enable_thinking=False
+            lora_request=lora_request,
+            use_tqdm=False,
+        )
+        output_text = outputs[0].outputs[0].text
+        return output_text
+
+    def reload_lora(name: str):
+        """
+        reload a lora to simulate the case: 
+        setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` 
+        for dynamic lora loading and unloading
+        """
+        remove_lora_response = llm.llm_engine.remove_lora(
+            lora_id=LORA_NAME_ID_MAP[name])
+
+        add_lora_response = llm.llm_engine.add_lora(
+            make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
+
+        print(f"{remove_lora_response=}, {add_lora_response=}")
+
+    def check_outputs(outputs: str, expected: str):
+        print(f"{prompt=}.\n{expected_output=}\n{output_text=}")
+        print("\n----------------------------\n")
+        assert outputs == expected
+
+    for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
+
+        output_text = call_llm_get_outputs(prompt, "Alice")
+        check_outputs(output_text, expected_output)
+
+        # call Bob, ignore what it is output
+        call_llm_get_outputs(prompt, "Bob")
+        print("After call Bob:")
+
+        # call Alice
+        output_text = call_llm_get_outputs(prompt, "Alice")
+        check_outputs(output_text, expected_output)
+
+        # reload Bob Lora
+        reload_lora("Bob")
+        print("After reload Bob:")
+
+        # call Alice
+        output_text = call_llm_get_outputs(prompt, "Alice")
+        check_outputs(output_text, expected_output)
+
+        # reload Alice Lora
+        reload_lora("Alice")
+        print("After reload Alice:")
+
+        output_text = call_llm_get_outputs(prompt, "Alice")
+        check_outputs(output_text, expected_output)
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index c3512ec3dbd43..de5933d6d41e5 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -682,12 +682,14 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     def slice_lora_b(
         self, lora_b: list[Union[torch.Tensor, None]]
     ) -> list[Union[torch.Tensor, None]]:
+        sliced_lora_b = [None] * self.n_slices
         for i, (shard_id, shard_size) in enumerate(
                 zip(self.output_ids, self.output_slices)):
             if (lora_b_i := lora_b[i]) is not None:
-                lora_b[i] = lora_b_i[:, shard_size * shard_id:shard_size *
-                                     (shard_id + 1)]
-        return lora_b
+                sliced_lora_b[i] = lora_b_i[:,
+                                            shard_size * shard_id:shard_size *
+                                            (shard_id + 1)]
+        return sliced_lora_b
 
     def slice_bias(
         self, bias: list[Union[torch.Tensor,

From 82de9b9d468dab451380d3e7dda88b0c40a31204 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 1 Aug 2025 13:44:10 +0800
Subject: [PATCH 06/54] [Misc] Automatically resolve HF processor init kwargs
 (#22005)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 examples/offline_inference/vision_language.py |  38 +++---
 tests/lora/test_qwen2vl.py                    |   6 -
 .../multimodal/generation/test_common.py      |  27 ++++-
 .../generation/vlm_utils/model_utils.py       |  12 ++
 .../processing/test_transformers.py           |   2 +-
 tests/models/registry.py                      |   3 +-
 tests/multimodal/test_processing.py           | 113 +++++++++++-------
 vllm/config.py                                |  12 +-
 vllm/inputs/registry.py                       |  17 ++-
 vllm/model_executor/models/aya_vision.py      |  12 +-
 vllm/model_executor/models/deepseek_vl2.py    |  36 +++---
 vllm/model_executor/models/florence2.py       |   6 -
 vllm/model_executor/models/fuyu.py            |   4 +-
 vllm/model_executor/models/glm4_1v.py         |   8 +-
 vllm/model_executor/models/h2ovl.py           |  16 +--
 .../models/hyperclovax_vision.py              |  20 +---
 vllm/model_executor/models/idefics3.py        |  10 +-
 vllm/model_executor/models/internvl.py        |  28 +----
 vllm/model_executor/models/keye.py            |  84 +------------
 vllm/model_executor/models/llava.py           |  46 ++-----
 vllm/model_executor/models/minicpmv.py        |   6 +-
 vllm/model_executor/models/mllama4.py         |   2 +-
 vllm/model_executor/models/nemotron_vl.py     |  24 +---
 vllm/model_executor/models/nvlm_d.py          |  16 +--
 vllm/model_executor/models/ovis.py            |   8 +-
 vllm/model_executor/models/phi3v.py           |  11 --
 vllm/model_executor/models/phi4_multimodal.py |  22 ++--
 vllm/model_executor/models/phi4mm.py          |  21 +---
 .../models/qwen2_5_omni_thinker.py            |  47 +-------
 vllm/model_executor/models/qwen2_5_vl.py      |  19 +--
 vllm/model_executor/models/qwen2_audio.py     |  18 +--
 vllm/model_executor/models/qwen2_vl.py        |  82 +------------
 vllm/model_executor/models/skyworkr1v.py      |  86 ++++---------
 vllm/model_executor/models/smolvlm.py         |  10 +-
 vllm/model_executor/models/tarsier.py         |  12 +-
 vllm/model_executor/models/transformers.py    |   5 -
 vllm/model_executor/models/ultravox.py        |  20 +---
 vllm/model_executor/models/whisper.py         |  15 ++-
 vllm/transformers_utils/processor.py          |  94 +++++++++------
 vllm/utils/__init__.py                        |  43 -------
 40 files changed, 334 insertions(+), 727 deletions(-)

diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 6f23a29e72f71..0edcd0407747c 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -449,25 +449,6 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# omni-research/Tarsier-7b
-def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-    model_name = "omni-research/Tarsier-7b"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=4096,
-        limit_mm_per_prompt={modality: 1},
-    )
-    prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-    )
-
-
 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "internlm/Intern-S1"
@@ -1293,6 +1274,25 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
     )
 
 
+# omni-research/Tarsier-7b
+def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "omni-research/Tarsier-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={modality: 1},
+    )
+    prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "omni-research/Tarsier2-Recap-7b"
 
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 604bb307b889d..76f3bc0ebf89f 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -4,8 +4,6 @@ from dataclasses import dataclass
 from typing import Optional
 
 import pytest
-from packaging.version import Version
-from transformers import __version__ as TRANSFORMERS_VERSION
 
 import vllm
 from vllm.assets.image import ImageAsset
@@ -185,10 +183,6 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
     current_platform.is_rocm(),
     reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
 )
-@pytest.mark.skipif(
-    Version(TRANSFORMERS_VERSION) < Version("4.49.0"),
-    reason="Qwen2.5-VL require transformers version no lower than 4.49.0",
-)
 def test_qwen25vl_lora(qwen25vl_lora_files):
     """Test Qwen 2.5 VL model with LoRA"""
     config = TestConfig(model_path=QWEN25VL_MODEL_PATH,
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index 5bff615fb1071..967228b54a0af 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -702,13 +702,38 @@ VLM_TEST_SETTINGS = {
     "smolvlm": VLMTestInfo(
         models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
     ),
+    "tarsier": VLMTestInfo(
+        models=["omni-research/Tarsier-7b"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        patch_hf_runner=model_utils.tarsier_patch_hf_runner,
+    ),
+    "tarsier2": VLMTestInfo(
+        models=["omni-research/Tarsier2-Recap-7b"],
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO,
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[pytest.mark.skip("Model initialization hangs")],
+    ),
     ### Tensor parallel / multi-gpu broadcast tests
     "chameleon-broadcast": VLMTestInfo(
         models=["facebook/chameleon-7b"],
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index c1a2aa0dcafbb..5e8dac6bce96a 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -818,3 +818,15 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     thinker.get_output_embeddings = lambda: thinker.lm_head
     hf_model.model = thinker
     return hf_model
+
+
+def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    from vllm.model_executor.models.tarsier import get_vision_encoder_info
+
+    vision_encoder_info = get_vision_encoder_info(hf_model.config)
+
+    hf_processor = hf_model.processor
+    if hf_processor.patch_size is None:
+        hf_processor.patch_size = vision_encoder_info.get_patch_size()
+
+    return hf_model
diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py
index c7d1b5271ff72..54a0be99384a8 100644
--- a/tests/models/multimodal/processing/test_transformers.py
+++ b/tests/models/multimodal/processing/test_transformers.py
@@ -16,7 +16,7 @@ def test_multimodal_processor(model_id):
         model_impl="transformers",
     )
 
-    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, )
+    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
 
     image_pil = ImageAsset('cherry_blossom').pil_image
     mm_data = {"image": image_pil}
diff --git a/tests/models/registry.py b/tests/models/registry.py
index b9e7de4e9fd11..806342a57dfab 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -465,8 +465,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                         is_available_online=False),
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
                                      trust_remote_code=True),
-    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b",  # noqa: E501
-                                                        hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
+    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),  # noqa: E501
     "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b",  # noqa: E501
                                                         hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}),  # noqa: E501
     "VoxtralForConditionalGeneration": _HfExamplesInfo(
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index 8a3f09bdbe27e..659ee9af9ddec 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -2,16 +2,15 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import nullcontext
-from types import MethodType
-from typing import cast
+from typing import Optional, cast
 from unittest.mock import MagicMock
 
 import numpy as np
 import pytest
 import torch
-from transformers import ProcessorMixin
 
 from vllm.config import ModelConfig
+from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
                                     MultiModalKwargsItem,
@@ -1013,57 +1012,91 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
         )
 
 
-class _ProcessorProxy:
+class DummyProcessor:
 
-    def __init__(self, processor: ProcessorMixin) -> None:
+    def __init__(self, a: int = 0, b: int = 0) -> None:
         super().__init__()
 
-        self.__processor = processor
-
-    def __getattr__(self, key: str):
-        return getattr(self.__processor, key)
+        self.a = a
+        self.b = b
 
     def __call__(
         self,
-        text=None,
-        images=None,
-        videos=None,
-        exists=None,
-        return_tensors=None,
-    ):
-        return dict(exists=exists)
+        a: int = 0,
+        c: int = 0,
+        return_tensors: Optional[str] = None,
+    ) -> dict[str, int]:
+        return dict(a=a, c=c)
 
 
-@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
 # yapf: disable
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
 @pytest.mark.parametrize(
-    ("call_kwargs", "expected_kwargs"),
+    ("config_kwargs", "inference_kwargs", "expected_kwargs"),
     [
-        # Should ignore invalid kwargs
-        ({"does_not_exist": 100}, {"exists": None}),
-        ({"exists": 1}, {"exists": 1}),
-        ({"does_not_exist": 100, "exists": 1}, {"exists": 1}),
+        ({"a": 1}, {}, {"a": 1, "b": 0}),
+        ({}, {"a": 1}, {"a": 1, "b": 0}),
+        # inference_kwargs should take precedence
+        ({"a": 1}, {"a": 2}, {"a": 2, "b": 0}),
+        # Should ignore extra kwargs
+        ({"a": 1, "c": 1}, {}, {"a": 1, "b": 0}),
+        ({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
     ],
 )
 # yapf: enable
-def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
-    model_config = ModelConfig(model_id)
+def test_hf_processor_init_kwargs(
+    model_id,
+    config_kwargs,
+    inference_kwargs,
+    expected_kwargs,
+):
+    # Should not be used since there is nothing to convert to tokens
+    mock_tokenizer = cast(AnyTokenizer, object())
 
-    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
-    orig_get_hf_processor = processor.info.get_hf_processor
-
-    def get_hf_processor(self, **kwargs):
-        assert kwargs == call_kwargs
-        return _ProcessorProxy(orig_get_hf_processor())
-
-    processor.info.get_hf_processor = MethodType(get_hf_processor,
-                                                 processor.info)
-
-    out_kwargs = processor._call_hf_processor(
-        prompt="",
-        mm_data={},
-        mm_kwargs=call_kwargs,
-        tok_kwargs={},
+    ctx = InputProcessingContext(
+        model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
+        tokenizer=mock_tokenizer,
     )
 
-    assert out_kwargs == expected_kwargs
+    processor = ctx.get_hf_processor(
+        DummyProcessor,  # type: ignore[arg-type]
+        **inference_kwargs,
+    )
+
+    for k, v in expected_kwargs.items():
+        assert getattr(processor, k) == v
+
+
+# yapf: disable
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy
+@pytest.mark.parametrize(
+    ("config_kwargs", "inference_kwargs", "expected_kwargs"),
+    [
+        ({"a": 1}, {}, {"a": 1, "c": 0}),
+        ({}, {"a": 1}, {"a": 1, "c": 0}),
+        # inference_kwargs should take precedence
+        ({"a": 1}, {"a": 2}, {"a": 2, "c": 0}),
+        # Should ignore extra kwargs
+        ({"a": 1, "c": 1}, {}, {"a": 1, "c": 1}),
+        ({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
+    ],
+)
+# yapf: enable
+def test_hf_processor_call_kwargs(
+    model_id,
+    config_kwargs,
+    inference_kwargs,
+    expected_kwargs,
+):
+    # Should not be used since there is nothing to convert to tokens
+    mock_tokenizer = cast(AnyTokenizer, object())
+
+    ctx = InputProcessingContext(
+        model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
+        tokenizer=mock_tokenizer,
+    )
+
+    processor = ctx.get_hf_processor(DummyProcessor)  # type: ignore[arg-type]
+
+    result = ctx.call_hf_processor(processor, {}, inference_kwargs)
+    assert result == expected_kwargs
diff --git a/vllm/config.py b/vllm/config.py
index edad5dd0406bf..9d5739ca11efd 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -11,6 +11,7 @@ import textwrap
 import uuid
 import warnings
 from collections import Counter
+from collections.abc import Mapping
 from contextlib import contextmanager
 from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass,
                          replace)
@@ -3332,7 +3333,16 @@ class MultiModalConfig:
             999 if envs.VLLM_USE_V1 else 1,
         )
 
-    # TODO: Add configs to init vision tower or not.
+    def merge_mm_processor_kwargs(
+        self,
+        inference_kwargs: Mapping[str, object],
+    ) -> dict[str, object]:
+        """
+        Get the keyword arguments to pass to the multi-modal processor
+        according to the extra arguments passed during inference.
+        """
+        kwargs = self.mm_processor_kwargs or {}
+        return kwargs | dict(inference_kwargs)
 
 
 @config
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 652136fbbfe73..6331a70b469aa 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -11,7 +11,7 @@ from typing_extensions import TypeVar
 from vllm.jsontree import JSONTree, json_map_leaves
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import cached_processor_from_config
-from vllm.utils import resolve_mm_processor_kwargs
+from vllm.utils import get_allowed_kwarg_only_overrides
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -154,14 +154,11 @@ class InputProcessingContext(InputContext):
         assert callable(hf_processor)
 
         mm_config = self.model_config.get_multimodal_config()
-        base_kwargs = mm_config.mm_processor_kwargs
-        if base_kwargs is None:
-            base_kwargs = {}
+        merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
 
-        merged_kwargs = resolve_mm_processor_kwargs(
-            base_kwargs,
-            kwargs,
+        allowed_kwargs = get_allowed_kwarg_only_overrides(
             hf_processor,
+            merged_kwargs,
             requires_kw_only=False,
             allow_var_kwargs=True,
         )
@@ -173,7 +170,9 @@ class InputProcessingContext(InputContext):
             return x
 
         try:
-            output = hf_processor(**data, **merged_kwargs, return_tensors="pt")
+            output = hf_processor(**data,
+                                  **allowed_kwargs,
+                                  return_tensors="pt")
             # this emulates output.to(dtype=self.model_config.dtype)
             if isinstance(output, BatchFeature):
                 cast_output = json_map_leaves(maybe_cast_dtype, output.data)
@@ -189,7 +188,7 @@ class InputProcessingContext(InputContext):
 
         except Exception as exc:
             msg = (f"Failed to apply {type(hf_processor).__name__} "
-                   f"on data={data} with kwargs={merged_kwargs}")
+                   f"on data={data} with kwargs={allowed_kwargs}")
 
             raise ValueError(msg) from exc
 
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index a3eee9f065aea..b476a4f918bc3 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -123,16 +123,10 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
         return self.ctx.get_hf_config(AyaVisionConfig)
 
     def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor:
-        processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
+        return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
 
-        # Temporary workaround since this processor has multiple image tokens
-        # See https://github.com/huggingface/transformers/issues/38350
-        processor._check_special_mm_tokens = lambda *args, **kwargs: None
-
-        return processor
-
-    def get_image_processor(self) -> GotOcr2ImageProcessor:
-        return self.get_hf_processor().image_processor
+    def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor:
+        return self.get_hf_processor(**kwargs).image_processor
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 544de5fe02d35..531018625478b 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -214,25 +214,25 @@ class DeepseekVL2MultiModalProcessor(
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-        if mm_data:
-            processed_outputs = self.info.ctx.call_hf_processor(
-                self.info.get_hf_processor(**mm_kwargs),
-                dict(prompt=prompt, **mm_data),
-                dict(**mm_kwargs, **tok_kwargs),
-            )
-            pixel_values = processed_outputs["pixel_values"]
-            # split pixel values into patches corresponding to each image
-            images_spatial_crop = processed_outputs["images_spatial_crop"]
-            patches_per_image = [
-                x.prod().item() + 1 for x in images_spatial_crop
-            ]
-            pixel_values = pixel_values.split(patches_per_image)
-            processed_outputs["pixel_values"] = pixel_values
-        else:
+        if not mm_data:
             tokenizer = self.info.get_tokenizer()
-            processed_outputs = tokenizer(prompt,
-                                          add_special_tokens=True,
-                                          return_tensors="pt")
+            return tokenizer(prompt,
+                             add_special_tokens=True,
+                             return_tensors="pt")
+
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
+        )
+
+        pixel_values = processed_outputs["pixel_values"]
+        # split pixel values into patches corresponding to each image
+        images_spatial_crop = processed_outputs["images_spatial_crop"]
+        patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop]
+        pixel_values = pixel_values.split(patches_per_image)
+        processed_outputs["pixel_values"] = pixel_values
 
         return processed_outputs
 
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
index 399c739f408ee..56e456c2f1f2a 100644
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -761,12 +761,6 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):
 
 class Florence2ProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_config(self):
-        return self.ctx.get_hf_config()
-
-    def get_hf_processor(self):
-        return self.ctx.get_hf_processor()
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}
 
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 7e1d478562a4c..b61e0361fe8c3 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -83,8 +83,8 @@ class FuyuProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(FuyuProcessor, **kwargs)
 
-    def get_image_processor(self) -> FuyuImageProcessor:
-        return self.get_hf_processor().image_processor
+    def get_image_processor(self, **kwargs: object) -> FuyuImageProcessor:
+        return self.get_hf_processor(**kwargs).image_processor
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index ae1bf22c704e5..5f306f05d140e 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -809,11 +809,11 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": 1}
 
-    def get_image_processor(self) -> Glm4vImageProcessor:
-        return self.get_hf_processor().image_processor
+    def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor:
+        return self.get_hf_processor(**kwargs).image_processor
 
-    def get_video_processor(self) -> Glm4vVideoProcessor:
-        return self.get_hf_processor().video_processor
+    def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor:
+        return self.get_hf_processor(**kwargs).video_processor
 
     def _get_vision_info(
         self,
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 467b074f37753..c3e4f81597adb 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -392,21 +392,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
 
 class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
 
-    def get_hf_processor(
-        self,
-        *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        **kwargs: object,
-    ) -> H2OVLProcessor:
-        if min_dynamic_patch is not None:
-            kwargs["min_dynamic_patch"] = min_dynamic_patch
-        if max_dynamic_patch is not None:
-            kwargs["max_dynamic_patch"] = max_dynamic_patch
-        if dynamic_image_size is not None:
-            kwargs["dynamic_image_size"] = dynamic_image_size
-
+    def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
         return self.ctx.init_processor(
             H2OVLProcessor,
             config=self.get_hf_config(),
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index 3e8e50b35c0b7..e5c94c7f3a706 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -25,8 +25,7 @@ import torch
 import torch.nn as nn
 from timm.layers import LayerNorm, LayerNorm2d
 from timm.models.regnet import RegStage
-from transformers import (AutoProcessor, BatchFeature, CLIPVisionConfig,
-                          SiglipVisionConfig)
+from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig
 from transformers.modeling_utils import no_init_weights
 
 from vllm.config import VllmConfig
@@ -80,26 +79,9 @@ HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs]
 
 class HCXVisionProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_config(self):
-        return self.ctx.get_hf_config()
-
     def get_vision_encoder_info(self):
         return get_vision_encoder_info(self.get_hf_config())
 
-    def get_hf_processor(
-        self,
-        **kwargs: object,
-    ):
-        processor_cls = type(
-            AutoProcessor.from_pretrained(
-                self.ctx.model_config.model,
-                trust_remote_code=self.ctx.model_config.trust_remote_code,
-            ))
-        return self.ctx.get_hf_processor(
-            processor_cls,
-            **kwargs,
-        )
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 6e991d99b9638..3c01789b90066 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -88,15 +88,7 @@ ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs]
 
 class Idefics3ProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_processor(
-        self,
-        *,
-        size: Optional[dict[str, int]] = None,
-        **kwargs: object,
-    ) -> Idefics3Processor:
-        if size is not None:
-            kwargs["size"] = size
-
+    def get_hf_processor(self, **kwargs: object) -> Idefics3Processor:
         return self.ctx.get_hf_processor(Idefics3Processor, **kwargs)
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index a0e98ca3f8155..8e766dd4c4768 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -665,14 +665,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
     """Basic image-only ProcessingInfo for InternVL-style models."""
 
     @abstractmethod
-    def get_hf_processor(
-        self,
-        *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        **kwargs: object,
-    ) -> BaseInternVLProcessor:
+    def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
         raise NotImplementedError
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
@@ -882,27 +875,12 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
 
         return max(max_frames_per_video, 1)
 
-    def get_hf_processor(
-        self,
-        *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        **kwargs: object,
-    ) -> InternVLProcessor:
-        if min_dynamic_patch is not None:
-            kwargs["min_dynamic_patch"] = min_dynamic_patch
-        if max_dynamic_patch is not None:
-            kwargs["max_dynamic_patch"] = max_dynamic_patch
-        if dynamic_image_size is not None:
-            kwargs["dynamic_image_size"] = dynamic_image_size
-
-        kwargs["video_token"] = self.get_video_token()
-
+    def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
         return self.ctx.init_processor(
             InternVLProcessor,
             config=self.get_hf_config(),
             tokenizer=self.get_tokenizer(),
+            video_token=self.get_video_token(),
             **kwargs,
         )
 
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 892d970aaade0..4d8aa8de0f0b1 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -44,8 +44,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
-from vllm.transformers_utils.processor import (
-    cached_image_processor_from_config)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -980,72 +978,8 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
 
 class KeyeProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_processor(
-        self,
-        *,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        size: Optional[dict[str, int]] = None,
-        **kwargs: object,
-    ):
-        return self.ctx.get_hf_processor(
-            image_processor=self.get_image_processor(
-                min_pixels=min_pixels,
-                max_pixels=max_pixels,
-                size=size,
-            ),
-            **kwargs,
-        )
-
-    def _get_image_processor_kwargs(
-        self,
-        *,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        size: Optional[dict[str, int]] = None,
-        **kwargs: object,
-    ):
-        if self.ctx.model_config.mm_processor_kwargs:
-            kwargs.update(self.ctx.model_config.mm_processor_kwargs)
-
-        if min_pixels is not None:
-            kwargs["min_pixels"] = min_pixels
-
-            if size is None:
-                size = {"shortest_edge": min_pixels}
-            else:
-                size["shortest_edge"] = min_pixels
-
-        if max_pixels is not None:
-            kwargs["max_pixels"] = max_pixels
-
-            if size is None:
-                size = {"longest_edge": max_pixels}
-            else:
-                size["longest_edge"] = max_pixels
-
-        if size is not None:
-            kwargs["size"] = size
-
-        return kwargs
-
-    def get_image_processor(
-        self,
-        *,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        size: Optional[dict[str, int]] = None,
-        **kwargs: object,
-    ):
-        return cached_image_processor_from_config(
-            self.ctx.model_config,
-            **self._get_image_processor_kwargs(
-                min_pixels=min_pixels,
-                max_pixels=max_pixels,
-                size=size,
-                **kwargs,
-            ),
-        )
+    def get_image_processor(self, **kwargs: object):
+        return self.get_hf_processor(**kwargs).image_processor
 
     def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
@@ -1246,20 +1180,6 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
     def _get_data_parser(self) -> MultiModalDataParser:
         return KeyeMultiModalDataParser()
 
-    def _call_hf_processor(
-        self,
-        prompt: str,
-        mm_data: Mapping[str, object],
-        mm_kwargs: Mapping[str, object],
-        tok_kwargs: Mapping[str, object],
-    ) -> BatchFeature:
-        mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs)
-        return self.info.ctx.call_hf_processor(
-            self.info.get_hf_processor(**mm_kwargs),
-            dict(text=prompt, **mm_data),
-            dict(**mm_kwargs, **tok_kwargs),
-        )
-
     def _get_prompt_updates(
         self,
         mm_items: MultiModalDataItems,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 0126ace09e707..c863ba406422d 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -8,11 +8,9 @@ from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar,
 
 import torch
 import torch.nn as nn
-from packaging.version import Version
 from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig,
                           PixtralVisionConfig, PretrainedConfig,
                           SiglipVisionConfig)
-from transformers import __version__ as TRANSFORMERS_VERSION
 from transformers.models.llava import LlavaProcessor
 from transformers.models.pixtral import PixtralProcessor
 
@@ -307,29 +305,14 @@ class PixtralHFMultiModalProcessor(
 
         pixel_values = processed_outputs.get("pixel_values")
         if pixel_values is not None:
-            # Before/after https://github.com/huggingface/transformers/pull/35122
-            if Version(TRANSFORMERS_VERSION) <= Version("4.48.3"):
-                images = mm_data["images"]
-                assert isinstance(images, list)
+            # Avoid padding since we need the output for each image to be
+            # independent of other images for the cache to work correctly
+            image_sizes = processed_outputs["image_sizes"]
+            assert len(pixel_values) == len(image_sizes)
 
-                # Original output: (1, num_images, C, H, W)
-                # New output: (num_images, C, H, W)
-                assert (isinstance(pixel_values, list)
-                        and len(pixel_values) == 1)
-                assert (isinstance(pixel_values[0], list)
-                        and len(pixel_values[0]) == len(images))
-
-                processed_outputs["pixel_values"] = pixel_values[0]
-            else:
-                # Avoid padding since we need the output for each image to be
-                # independent of other images for the cache to work correctly
-                image_sizes = processed_outputs["image_sizes"]
-                assert len(pixel_values) == len(image_sizes)
-
-                processed_outputs["pixel_values"] = [
-                    p[:, :h, :w]
-                    for p, (h, w) in zip(pixel_values, image_sizes)
-                ]
+            processed_outputs["pixel_values"] = [
+                p[:, :h, :w] for p, (h, w) in zip(pixel_values, image_sizes)
+            ]
 
         return processed_outputs
 
@@ -784,17 +767,10 @@ class MantisProcessingInfo(LlavaProcessingInfo):
         vision_info = self.get_vision_encoder_info()
 
         kwargs.setdefault("patch_size", vision_info.get_patch_size())
-
-        if Version(TRANSFORMERS_VERSION) < Version("4.48"):
-            # BUG: num_additional_image_tokens = 0 but treated as 1,
-            # so we set vision_feature_select_strategy to None to offset this
-            kwargs.setdefault("vision_feature_select_strategy", None)
-        else:
-            # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150
-            kwargs.setdefault(
-                "vision_feature_select_strategy",
-                hf_config.vision_feature_select_strategy,
-            )
+        kwargs.setdefault(
+            "vision_feature_select_strategy",
+            hf_config.vision_feature_select_strategy,
+        )
 
         return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
 
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 70f2d4a6420b9..e172758b2f2c5 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -331,10 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
 
         return hf_processor
 
-    def get_image_processor(self):
-        hf_processor = self.get_hf_processor()
-        image_processor = hf_processor.image_processor  # type: ignore
-        return image_processor
+    def get_image_processor(self, **kwargs: object):
+        return self.get_hf_processor(**kwargs).image_processor
 
     def get_model_version(self):
         return get_version_by_config(self.get_hf_config())
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index dea85d320adfd..924f10d82b381 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -533,7 +533,7 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
 
     def get_hf_processor(self, **kwargs: object) -> Llama4Processor:
         return self.ctx.get_hf_processor(Llama4Processor,
-                                         use_fast=True,
+                                         use_fast=kwargs.pop("use_fast", True),
                                          **kwargs)
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py
index 5d0513d707413..b90cb9b39a60b 100644
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -137,34 +137,16 @@ class NemotronVLProcessor(InternVLProcessor):
 class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
     """Processing info for Nemotron VL models."""
 
-    def get_hf_processor(
-        self,
-        *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        **kwargs: object,
-    ) -> NemotronVLProcessor:
-        if min_dynamic_patch is not None:
-            kwargs["min_dynamic_patch"] = min_dynamic_patch
-        if max_dynamic_patch is not None:
-            kwargs["max_dynamic_patch"] = max_dynamic_patch
-        if dynamic_image_size is not None:
-            kwargs["dynamic_image_size"] = dynamic_image_size
-
-        image_processor = self.get_image_processor()
+    def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor:
         return self.ctx.init_processor(
             NemotronVLProcessor,
             config=self.get_hf_config(),
             tokenizer=self.get_tokenizer(),
-            image_processor=image_processor,
+            image_processor=self.get_image_processor(),
             **kwargs,
         )
 
-    def get_image_processor(
-        self,
-        **kwargs: object,
-    ):
+    def get_image_processor(self, **kwargs: object):
         return cached_image_processor_from_config(
             self.ctx.model_config,
             **kwargs,
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py
index 2f7f8e437f0ad..4bea1392a6814 100644
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -63,21 +63,7 @@ class NVLMProcessor(BaseInternVLProcessor):
 
 class NVLMProcessingInfo(BaseInternVLProcessingInfo):
 
-    def get_hf_processor(
-        self,
-        *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        **kwargs: object,
-    ) -> NVLMProcessor:
-        if min_dynamic_patch is not None:
-            kwargs["min_dynamic_patch"] = min_dynamic_patch
-        if max_dynamic_patch is not None:
-            kwargs["max_dynamic_patch"] = max_dynamic_patch
-        if dynamic_image_size is not None:
-            kwargs["dynamic_image_size"] = dynamic_image_size
-
+    def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
         return self.ctx.init_processor(
             NVLMProcessor,
             config=self.get_hf_config(),
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index c8b528048b557..6b27980e0b0c3 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -25,7 +25,7 @@ import torch
 import torch.nn as nn
 from torch import Tensor
 from torch.nn.functional import gumbel_softmax, pad, softmax
-from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig
+from transformers import BatchFeature, PretrainedConfig
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.linear import ReplicatedLinear
@@ -245,11 +245,12 @@ class VisualEmbedding(torch.nn.Embedding):
 
 class OvisProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_processor(self, **kwargs):
+    def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(
             OvisProcessor,
             image_pad_token=self.get_image_pad_token(),
             image_segment_len=self.get_image_segment_len(),
+            **kwargs,
         )
 
     def get_image_segment_len(self) -> int:
@@ -269,9 +270,6 @@ class OvisProcessingInfo(BaseProcessingInfo):
         text_model_type = hf_text_config.model_type
         return IMAGE_PAD_TOKEN_MAP.get(text_model_type)
 
-    def get_image_processor(self) -> BaseImageProcessor:
-        return self.get_hf_processor().image_processor  # type: ignore
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index aa739f22fd7bf..9ef4f8371eb3d 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -318,17 +318,6 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
 
 class Phi3VProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_processor(
-        self,
-        *,
-        num_crops: Optional[int] = None,
-        **kwargs: object,
-    ) -> ProcessorMixin:
-        if num_crops is not None:
-            kwargs["num_crops"] = num_crops
-
-        return self.ctx.get_hf_processor(**kwargs)
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
 
diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py
index 432b707a61591..e13b8276bf17a 100644
--- a/vllm/model_executor/models/phi4_multimodal.py
+++ b/vllm/model_executor/models/phi4_multimodal.py
@@ -696,19 +696,12 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self) -> Phi4MultimodalConfig:
         return self.ctx.get_hf_config(Phi4MultimodalConfig)
 
-    def get_hf_processor(
-        self,
-        *,
-        dynamic_hd: Optional[int] = None,
-        **kwargs: object,
-    ) -> Phi4MMProcessor:
-        if dynamic_hd is not None:
-            kwargs["dynamic_hd"] = dynamic_hd
+    def get_hf_processor(self, **kwargs: object) -> Phi4MMProcessor:
+        return self.ctx.get_hf_processor(Phi4MMProcessor, **kwargs)
 
-        return self.ctx.get_hf_processor(**kwargs)
-
-    def get_feature_extractor(self) -> Phi4MultimodalFeatureExtractor:
-        return self.get_hf_processor().audio_processor
+    def get_feature_extractor(
+            self, **kwargs: object) -> Phi4MultimodalFeatureExtractor:
+        return self.get_hf_processor(**kwargs).audio_processor
 
     def get_image_processor(
         self,
@@ -1007,7 +1000,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
 
         if audio_data:
             audio_features = processed_outputs['audio_input_features']
-            sr = self.info.get_feature_extractor().sampling_rate
+            sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate
             feature_sizes = [
                 self.info.get_audio_num_frames(len(audio), sr)
                 for audio in audio_data
@@ -1043,7 +1036,8 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
         audio_token_id = tokenizer.vocab[tokenizer.audio_token]
 
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-        audio_processor = self.info.get_feature_extractor()
+        audio_processor = self.info.get_feature_extractor(
+            **hf_processor_mm_kwargs)
 
         def get_image_replacement_phi4mm(item_idx: int):
             images = mm_items.get_items(
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 9b61c3634d841..73e8446e6dea7 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -459,17 +459,6 @@ def cat_with_pad(tensors, dim, padding_value=0):
 
 class Phi4MMProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_processor(
-        self,
-        *,
-        dynamic_hd: Optional[int] = None,
-        **kwargs: object,
-    ) -> ProcessorMixin:
-        if dynamic_hd is not None:
-            kwargs["dynamic_hd"] = dynamic_hd
-
-        return self.ctx.get_hf_processor(**kwargs)
-
     @property
     def image_tokens(self) -> list[str]:
         return [f"<|image_{i+1}|>" for i in range(100)]
@@ -487,8 +476,9 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
         image_processor = processor.image_processor
         return image_processor.dynamic_hd
 
-    def get_feature_extractor(self) -> SequenceFeatureExtractor:
-        return self.get_hf_processor().audio_processor
+    def get_feature_extractor(self,
+                              **kwargs: object) -> SequenceFeatureExtractor:
+        return self.get_hf_processor(**kwargs).audio_processor
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None, "image": None}
@@ -769,7 +759,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
             prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
             return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
 
-        sr = self.info.get_feature_extractor().sampling_rate
+        sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate
         if (audio_data := mm_data.get("audios", [])):
             mm_data['audios'] = [(data, sr) for data in audio_data]
 
@@ -816,7 +806,8 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
     ) -> Sequence[PromptUpdate]:
         image_tokens: list[str] = self.info.image_tokens  # type: ignore
         audio_tokens: list[str] = self.info.audio_tokens  # type: ignore
-        feature_extractor = self.info.get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor(
+            **hf_processor_mm_kwargs)
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
 
         def get_image_replacement_phi4mm(item_idx: int):
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index c5a5c10d9509f..b9fed79c84cdd 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -132,50 +132,15 @@ class Qwen2_5OmniThinkerProcessingInfo(Qwen2AudioProcessingInfo,
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2_5OmniConfig).thinker_config
 
-    def get_hf_processor(
-        self,
-        *,
-        sampling_rate: Optional[int] = None,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        size: Optional[dict[str, int]] = None,
-        fps: Optional[Union[float, list[float]]] = None,
-        **kwargs: object,
-    ) -> Qwen2_5OmniProcessor:
-        if fps is not None:
-            kwargs["fps"] = fps
-
-        # Monkey patch for Transformers v4.53
-        processor_class = Qwen2_5OmniProcessor
-        if processor_class.image_processor_class != "AutoImageProcessor":
-            processor_class.image_processor_class = "AutoImageProcessor"
-        if processor_class.video_processor_class != "AutoVideoProcessor":
-            processor_class.video_processor_class = "AutoVideoProcessor"
-
-        processor = self.ctx.get_hf_processor(
-            processor_class,
-            image_processor=self.get_image_processor(min_pixels=min_pixels,
-                                                     max_pixels=max_pixels,
-                                                     size=size,
-                                                     use_fast=kwargs.get(
-                                                         "use_fast", True)),
+    def get_hf_processor(self, **kwargs: object) -> Qwen2_5OmniProcessor:
+        return self.ctx.get_hf_processor(
+            Qwen2_5OmniProcessor,
+            use_fast=kwargs.pop("use_fast", True),
             **kwargs,
         )
-        if not hasattr(processor, "audio_token"):
-            processor.audio_token = "<|AUDIO|>"
-        if not hasattr(processor, "image_token"):
-            processor.image_token = "<|IMAGE|>"
-        if not hasattr(processor, "video_token"):
-            processor.video_token = "<|VIDEO|>"
-        return processor
 
-    def get_feature_extractor(
-        self,
-        *,
-        sampling_rate: Optional[int] = None,
-        **kwargs: object,
-    ):
-        hf_processor = self.get_hf_processor(sampling_rate=sampling_rate)
+    def get_feature_extractor(self, **kwargs: object):
+        hf_processor = self.get_hf_processor(**kwargs)
         feature_extractor = hf_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 8ae096536fdc5..c4c4650f569e1 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -780,25 +780,10 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2_5_VLConfig)
 
-    def get_hf_processor(
-        self,
-        *,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        size: Optional[dict[str, int]] = None,
-        fps: Optional[Union[float, list[float]]] = None,
-        **kwargs: object,
-    ) -> Qwen2_5_VLProcessor:
-        if fps is not None:
-            kwargs["fps"] = fps
-
+    def get_hf_processor(self, **kwargs: object) -> Qwen2_5_VLProcessor:
         return self.ctx.get_hf_processor(
             Qwen2_5_VLProcessor,
-            image_processor=self.get_image_processor(min_pixels=min_pixels,
-                                                     max_pixels=max_pixels,
-                                                     size=size,
-                                                     use_fast=kwargs.get(
-                                                         "use_fast", True)),
+            use_fast=kwargs.pop("use_fast", True),
             **kwargs,
         )
 
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index d7fec30acd8d3..3ef55cd704cf0 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -86,22 +86,12 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2AudioConfig)
 
-    def get_hf_processor(
-        self,
-        *,
-        # Ignored in initialization
-        sampling_rate: Optional[int] = None,
-        **kwargs: object,
-    ) -> Qwen2AudioProcessor:
+    def get_hf_processor(self, **kwargs: object) -> Qwen2AudioProcessor:
         return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs)
 
-    def get_feature_extractor(
-        self,
-        *,
-        # Ignored in initialization
-        sampling_rate: Optional[int] = None,
-    ) -> WhisperFeatureExtractor:
-        hf_processor = self.get_hf_processor(sampling_rate=sampling_rate)
+    def get_feature_extractor(self,
+                              **kwargs: object) -> WhisperFeatureExtractor:
+        hf_processor = self.get_hf_processor(**kwargs)
         feature_extractor = hf_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index ad63bb4af4e9d..4e8ea8e449133 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -69,8 +69,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.platforms import _Backend, current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
-from vllm.transformers_utils.processor import (
-    cached_image_processor_from_config)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -752,73 +750,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(Qwen2VLConfig)
 
-    def get_hf_processor(
-        self,
-        *,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        size: Optional[dict[str, int]] = None,
-        **kwargs: object,
-    ) -> Qwen2VLProcessor:
+    def get_hf_processor(self, **kwargs: object) -> Qwen2VLProcessor:
         return self.ctx.get_hf_processor(
             Qwen2VLProcessor,
-            image_processor=self.get_image_processor(min_pixels=min_pixels,
-                                                     max_pixels=max_pixels,
-                                                     size=size,
-                                                     use_fast=kwargs.get(
-                                                         "use_fast", True)),
+            use_fast=kwargs.pop("use_fast", True),
             **kwargs,
         )
 
-    def _get_image_processor_kwargs(
-        self,
-        *,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        size: Optional[dict[str, int]] = None,
-        **kwargs: object,
-    ):
-        mm_config = self.ctx.model_config.get_multimodal_config()
-        if mm_config.mm_processor_kwargs:
-            kwargs.update(mm_config.mm_processor_kwargs)
-
-        if min_pixels is not None:
-            kwargs["min_pixels"] = min_pixels
-
-            if size is None:
-                size = {"shortest_edge": min_pixels}
-            else:
-                size["shortest_edge"] = min_pixels
-
-        if max_pixels is not None:
-            kwargs["max_pixels"] = max_pixels
-
-            if size is None:
-                size = {"longest_edge": max_pixels}
-            else:
-                size["longest_edge"] = max_pixels
-
-        if size is not None:
-            kwargs["size"] = size
-
-        return kwargs
-
-    def get_image_processor(
-        self,
-        *,
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        size: Optional[dict[str, int]] = None,
-        **kwargs: object,
-    ) -> Qwen2VLImageProcessor:
-        kwargs["use_fast"] = kwargs.get("use_fast", True)
-        return cached_image_processor_from_config(
-            self.ctx.model_config,
-            **self._get_image_processor_kwargs(min_pixels=min_pixels,
-                                               max_pixels=max_pixels,
-                                               size=size,
-                                               **kwargs),
-        )
+    def get_image_processor(self, **kwargs: object) -> Qwen2VLImageProcessor:
+        return self.get_hf_processor(**kwargs).image_processor
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
@@ -1023,20 +963,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
     def _get_data_parser(self) -> MultiModalDataParser:
         return Qwen2VLMultiModalDataParser()
 
-    def _call_hf_processor(
-        self,
-        prompt: str,
-        mm_data: Mapping[str, object],
-        mm_kwargs: Mapping[str, object],
-        tok_kwargs: Mapping[str, object],
-    ) -> BatchFeature:
-        mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs)
-        return self.info.ctx.call_hf_processor(
-            self.info.get_hf_processor(**mm_kwargs),
-            dict(text=prompt, **mm_data),
-            dict(**mm_kwargs, **tok_kwargs),
-        )
-
     def _get_prompt_updates(
         self,
         mm_items: MultiModalDataItems,
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index 5ae5c0bc1d5dc..c76aabcd27ccb 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -7,9 +7,8 @@
 # Copyright (c) 2025 Skywork
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
-from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict, TypeVar, Union
+from typing import Literal, Optional, TypedDict, Union
 
 import torch
 import torch.nn as nn
@@ -232,7 +231,7 @@ def image_to_pixel_values_skyworkr1v(
     return pixel_values
 
 
-class BaseSkyworkR1VProcessor(ABC):
+class SkyworkR1VProcessor:
     """
     This model doesn't define its own HF processor,
     so we implement our own one here.
@@ -279,17 +278,18 @@ class BaseSkyworkR1VProcessor(ABC):
         self.use_thumbnail: bool = config.use_thumbnail
 
     @property
-    @abstractmethod
     def image_token_id(self) -> int:
-        raise NotImplementedError
+        return self.tokenizer.get_vocab()[IMG_CONTEXT]
 
-    @abstractmethod
     def get_image_repl(
         self,
         feature_size: int,
         num_patches: Optional[int],
     ) -> PromptUpdateDetails[str]:
-        raise NotImplementedError
+        repl_features = IMG_CONTEXT * feature_size
+        repl_full = IMG_START + repl_features + IMG_END
+
+        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
 
     def resolve_min_max_num(
         self,
@@ -426,35 +426,15 @@ class BaseSkyworkR1VProcessor(ABC):
         }
 
 
-class SkyworkR1VProcessor(BaseSkyworkR1VProcessor):
+class SkyworkR1VProcessingInfo(BaseProcessingInfo):
 
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: Optional[int],
-    ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
-
-class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo):
-
-    @abstractmethod
-    def get_hf_processor(
-        self,
-        *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        **kwargs: object,
-    ) -> BaseSkyworkR1VProcessor:
-        raise NotImplementedError
+    def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
+        return self.ctx.init_processor(
+            SkyworkR1VProcessor,
+            config=self.get_hf_config(),
+            tokenizer=self.get_tokenizer(),
+            **kwargs,
+        )
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
@@ -464,7 +444,7 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[BaseSkyworkR1VProcessor],
+        processor: Optional[SkyworkR1VProcessor],
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -500,10 +480,8 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo):
         return largest_feature_pinpoint
 
 
-_I = TypeVar("_I", bound=BaseSkyworkR1VProcessingInfo)
-
-
-class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
+class SkyworkR1VDummyInputsBuilder(
+        BaseDummyInputsBuilder[SkyworkR1VProcessingInfo]):
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
@@ -527,7 +505,8 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         }
 
 
-class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]):
+class SkyworkR1VMultiModalProcessor(
+        BaseMultiModalProcessor[SkyworkR1VProcessingInfo]):
 
     def _call_hf_processor(
         self,
@@ -617,31 +596,6 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]):
         ]
 
 
-class SkyworkR1VProcessingInfo(BaseSkyworkR1VProcessingInfo):
-
-    def get_hf_processor(
-        self,
-        *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        **kwargs: object,
-    ) -> SkyworkR1VProcessor:
-        if min_dynamic_patch is not None:
-            kwargs["min_dynamic_patch"] = min_dynamic_patch
-        if max_dynamic_patch is not None:
-            kwargs["max_dynamic_patch"] = max_dynamic_patch
-        if dynamic_image_size is not None:
-            kwargs["dynamic_image_size"] = dynamic_image_size
-
-        return self.ctx.init_processor(
-            SkyworkR1VProcessor,
-            config=self.get_hf_config(),
-            tokenizer=self.get_tokenizer(),
-            **kwargs,
-        )
-
-
 @MULTIMODAL_REGISTRY.register_processor(
     SkyworkR1VMultiModalProcessor,
     info=SkyworkR1VProcessingInfo,
diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py
index 0f22ba5b406ce..2adfad67152b3 100644
--- a/vllm/model_executor/models/smolvlm.py
+++ b/vllm/model_executor/models/smolvlm.py
@@ -19,15 +19,7 @@ from .idefics3 import Idefics3ProcessingInfo
 
 class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
 
-    def get_hf_processor(
-        self,
-        *,
-        max_image_size: Optional[dict[str, int]] = None,
-        **kwargs: object,
-    ) -> SmolVLMProcessor:
-        if max_image_size is not None:
-            kwargs["max_image_size"] = max_image_size
-
+    def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:
         return self.ctx.get_hf_processor(SmolVLMProcessor, **kwargs)
 
     def _get_image_token(
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
index 979d789b330cf..70cf5e95a54e1 100644
--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -178,13 +178,11 @@ class TarsierProcessingInfo(BaseProcessingInfo):
         return get_vision_encoder_info(self.get_hf_config())
 
     def get_hf_processor(self, **kwargs: object) -> TarsierProcessor:
-        hf_processor = self.ctx.get_hf_processor(TarsierProcessor, **kwargs)
-        # Patch for patch_size if needed (copied from vLLM LLaVA)
-        if hasattr(hf_processor,
-                   'patch_size') and hf_processor.patch_size is None:
-            patch_size = self.get_vision_encoder_info().get_patch_size()
-            hf_processor.patch_size = patch_size
-        return hf_processor
+        vision_info = self.get_vision_encoder_info()
+
+        kwargs.setdefault("patch_size", vision_info.get_patch_size())
+
+        return self.ctx.get_hf_processor(TarsierProcessor, **kwargs)
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index 8cd95605cdfae..e67548800c354 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -48,7 +48,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.processor import cached_get_processor
 from vllm.utils import is_list_of
 
 from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP,
@@ -189,10 +188,6 @@ class MultiModalProcessingInfo(BaseProcessingInfo):
         image_tokens = mm_tokens["num_image_tokens"][0]
         return image_tokens
 
-    def get_hf_processor(self):
-        processor = cached_get_processor(self.ctx.model_config.model)
-        return processor
-
     def get_max_image_size(self):
         return 10_000, 10_000  # hardcode for arbitrary very large size
 
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index a4569ccd5a845..bef34c1be49fe 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -71,13 +71,7 @@ UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs,
 
 class UltravoxProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_processor(
-        self,
-        *,
-        # Ignored in initialization
-        sampling_rate: Optional[int] = None,
-        **kwargs: object,
-    ) -> ProcessorMixin:
+    def get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
         config = self.ctx.model_config.hf_config
         hf_processor = self.ctx.get_hf_processor(**kwargs)
 
@@ -89,13 +83,9 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
 
         return hf_processor
 
-    def get_feature_extractor(
-        self,
-        *,
-        # Ignored in initialization
-        sampling_rate: Optional[int] = None,
-    ) -> WhisperFeatureExtractor:
-        hf_processor = self.get_hf_processor(sampling_rate=sampling_rate)
+    def get_feature_extractor(self,
+                              **kwargs: object) -> WhisperFeatureExtractor:
+        hf_processor = self.get_hf_processor(**kwargs)
         audio_processor = hf_processor.audio_processor  # type: ignore
         feature_extractor = audio_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
@@ -156,7 +146,7 @@ class UltravoxMultiModalProcessor(
         audios = mm_data.pop("audios", [])
         assert isinstance(audios, list)
 
-        feature_extractor = self.info.get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
         mm_kwargs = dict(
             **mm_kwargs,
             sampling_rate=feature_extractor.sampling_rate,
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index d7bafb9ef84d9..ca02ecd828ba3 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -623,23 +623,22 @@ class WhisperProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self) -> WhisperConfig:
         return self.ctx.get_hf_config(WhisperConfig)
 
-    def get_hf_processor(self,
-                         sampling_rate: Optional[int] = None
-                         ) -> WhisperProcessor:
-        # HACK: Transformers 4.53.0 has issue with whisper tokenizer to
+    def get_hf_processor(self, **kwargs: object) -> WhisperProcessor:
+        # HACK: Transformers 4.53.2 has issue with whisper tokenizer to
         # initialize processor. We use a monkeypatch to fix it here.
         # See: https://github.com/vllm-project/vllm/issues/20224
         processor_class = WhisperProcessor
         tokenizer_class = ("WhisperTokenizer", "WhisperTokenizerFast")
         if processor_class.tokenizer_class != tokenizer_class:
             processor_class.tokenizer_class = tokenizer_class
-        return self.ctx.get_hf_processor(processor_class)
+        return self.ctx.get_hf_processor(processor_class, **kwargs)
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": 1}
 
-    def get_feature_extractor(self) -> WhisperFeatureExtractor:
-        hf_processor = self.get_hf_processor()
+    def get_feature_extractor(self,
+                              **kwargs: object) -> WhisperFeatureExtractor:
+        hf_processor = self.get_hf_processor(**kwargs)
         feature_extractor = hf_processor.feature_extractor  # type: ignore
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
@@ -702,7 +701,7 @@ class WhisperMultiModalProcessor(
         tok_kwargs: Mapping[str, object],
     ) -> BatchFeature:
         if mm_data:
-            feature_extractor = self.info.get_feature_extractor()
+            feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
             mm_data = dict(audio=mm_data.pop("audios"))
             mm_kwargs = dict(
                 **mm_kwargs,
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index 70cd08263d372..a630d940b2578 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -4,9 +4,15 @@
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Optional, Union, cast
 
+from transformers import (AutoFeatureExtractor, AutoImageProcessor,
+                          AutoProcessor)
+from transformers.feature_extraction_utils import FeatureExtractionMixin
+from transformers.image_processing_utils import BaseImageProcessor
 from transformers.processing_utils import ProcessorMixin
 from typing_extensions import TypeVar
 
+from vllm.utils import get_allowed_kwarg_only_overrides
+
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
 
@@ -33,23 +39,42 @@ class HashableList(list):
         return hash(tuple(self))
 
 
-def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs):
-    mm_config = model_config.get_multimodal_config()
-    base_kwargs = mm_config.mm_processor_kwargs
-    if base_kwargs is None:
-        base_kwargs = {}
+def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]):
+    if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
+        return AutoProcessor.from_pretrained
+    if hasattr(processor_cls, "from_pretrained"):
+        return processor_cls.from_pretrained
 
-    merged_kwargs = {**base_kwargs, **kwargs}
+    return processor_cls
+
+
+def _merge_mm_kwargs(
+    model_config: "ModelConfig",
+    processor_cls: Union[type, tuple[type, ...]],
+    /,
+    **kwargs,
+):
+    mm_config = model_config.get_multimodal_config()
+    merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
+
+    factory = _get_processor_factory_fn(processor_cls)
+    allowed_kwargs = get_allowed_kwarg_only_overrides(
+        factory,
+        merged_kwargs,
+        requires_kw_only=False,
+        allow_var_kwargs=True,
+    )
 
     # NOTE: Pythonic dict is not hashable and will raise unhashable type
     # error when calling `cached_get_processor`, therefore we need to
     # wrap it to a hashable dict.
-    for key, value in merged_kwargs.items():
+    for key, value in allowed_kwargs.items():
         if isinstance(value, dict):
-            merged_kwargs[key] = HashableDict(value)
+            allowed_kwargs[key] = HashableDict(value)
         if isinstance(value, list):
-            merged_kwargs[key] = HashableList(value)
-    return merged_kwargs
+            allowed_kwargs[key] = HashableList(value)
+
+    return allowed_kwargs
 
 
 def get_processor(
@@ -61,21 +86,29 @@ def get_processor(
     **kwargs: Any,
 ) -> _P:
     """Load a processor for the given model name via HuggingFace."""
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from transformers import AutoProcessor
-
-    processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or
-                         isinstance(processor_cls, tuple) else processor_cls)
+    if revision is None:
+        revision = "main"
 
     try:
-        processor = processor_factory.from_pretrained(
-            processor_name,
-            *args,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            **kwargs,
-        )
+        if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
+            processor = AutoProcessor.from_pretrained(
+                processor_name,
+                *args,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            )
+        elif issubclass(processor_cls, ProcessorMixin):
+            processor = processor_cls.from_pretrained(
+                processor_name,
+                *args,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            )
+        else:
+            # Processors that are standalone classes unrelated to HF
+            processor = processor_cls(*args, **kwargs)
     except ValueError as e:
         # If the error pertains to the processor class not existing or not
         # currently being imported, suggest using the --trust-remote-code flag.
@@ -112,7 +145,7 @@ def cached_processor_from_config(
         revision=model_config.revision,
         trust_remote_code=model_config.trust_remote_code,
         processor_cls=processor_cls,  # type: ignore[arg-type]
-        **_merge_mm_kwargs(model_config, **kwargs),
+        **_merge_mm_kwargs(model_config, processor_cls, **kwargs),
     )
 
 
@@ -125,10 +158,6 @@ def get_feature_extractor(
 ):
     """Load an audio feature extractor for the given model name 
     via HuggingFace."""
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from transformers import AutoFeatureExtractor
-    from transformers.feature_extraction_utils import FeatureExtractionMixin
     try:
         feature_extractor = AutoFeatureExtractor.from_pretrained(
             processor_name,
@@ -164,7 +193,7 @@ def cached_feature_extractor_from_config(
         model_config.model,
         revision=model_config.revision,
         trust_remote_code=model_config.trust_remote_code,
-        **_merge_mm_kwargs(model_config, **kwargs),
+        **_merge_mm_kwargs(model_config, AutoFeatureExtractor, **kwargs),
     )
 
 
@@ -176,11 +205,6 @@ def get_image_processor(
     **kwargs: Any,
 ):
     """Load an image processor for the given model name via HuggingFace."""
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from transformers import AutoImageProcessor
-    from transformers.image_processing_utils import BaseImageProcessor
-
     try:
         processor = AutoImageProcessor.from_pretrained(
             processor_name,
@@ -217,5 +241,5 @@ def cached_image_processor_from_config(
         model_config.model,
         revision=model_config.revision,
         trust_remote_code=model_config.trust_remote_code,
-        **_merge_mm_kwargs(model_config, **kwargs),
+        **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
     )
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index ae978c855a8e5..a7f579b0c9c2d 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -2010,49 +2010,6 @@ def supports_kw(
     return False
 
 
-def resolve_mm_processor_kwargs(
-    init_kwargs: Optional[Mapping[str, object]],
-    inference_kwargs: Optional[Mapping[str, object]],
-    callable: Callable[..., object],
-    *,
-    requires_kw_only: bool = True,
-    allow_var_kwargs: bool = False,
-) -> dict[str, Any]:
-    """Applies filtering to eliminate invalid mm_processor_kwargs, i.e.,
-    those who are not explicit keywords to the given callable (of one is
-    given; otherwise no filtering is done), then merges the kwarg dicts,
-    giving priority to inference_kwargs if there are any collisions.
-
-    In the case that no kwarg overrides are provided, returns an empty
-    dict so that it can still be kwarg expanded into the callable later on.
-
-    If allow_var_kwargs=True, allows for things that can be expanded into
-    kwargs as long as they aren't naming collision for var_kwargs or potential
-    positional arguments.
-    """
-    # Filter inference time multimodal processor kwargs provided
-    runtime_mm_kwargs = get_allowed_kwarg_only_overrides(
-        callable,
-        overrides=inference_kwargs,
-        requires_kw_only=requires_kw_only,
-        allow_var_kwargs=allow_var_kwargs,
-    )
-
-    # Filter init time multimodal processor kwargs provided
-    init_mm_kwargs = get_allowed_kwarg_only_overrides(
-        callable,
-        overrides=init_kwargs,
-        requires_kw_only=requires_kw_only,
-        allow_var_kwargs=allow_var_kwargs,
-    )
-
-    # Merge the final processor kwargs, prioritizing inference
-    # time values over the initialization time values.
-    mm_processor_kwargs = {**init_mm_kwargs, **runtime_mm_kwargs}
-
-    return mm_processor_kwargs
-
-
 def get_allowed_kwarg_only_overrides(
     callable: Callable[..., object],
     overrides: Optional[Mapping[str, object]],

From e1a7fe4af5e9c287501c648e64956a08705af86a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Seznec?= <mickael@mistral.ai>
Date: Fri, 1 Aug 2025 07:45:02 +0200
Subject: [PATCH 07/54] [BugFix] fix: aot passes kvcache dtype information
 (#19750)

Signed-off-by: Mickael Seznec <mickael@mistral.ai>
---
 vllm/v1/attention/backends/flash_attn.py | 25 ++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 4c2a6c6b985b2..3f9afa67aef70 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -99,6 +99,13 @@ class FlashAttentionBackend(AttentionBackend):
             raise ValueError(f"Unknown cache layout format {cache_layout}.")
         return stride_order
 
+    @staticmethod
+    def get_fp8_dtype_for_flashattn(kv_cache_dtype: str) -> torch.dtype:
+        if kv_cache_dtype in ("fp8", "fp8_e4m3"):
+            return torch.float8_e4m3fn
+        else:
+            raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
+
 
 @dataclass
 class FlashAttentionMetadata:
@@ -161,6 +168,7 @@ class FlashAttentionMetadataBuilder(
             self.parallel_config)
         self.num_heads_kv = self.model_config.get_num_kv_heads(
             self.parallel_config)
+        self.kv_cache_dtype = kv_cache_spec.dtype
         self.headdim = self.model_config.get_head_size()
         self.block_size = kv_cache_spec.block_size
 
@@ -239,17 +247,24 @@ class FlashAttentionMetadataBuilder(
 
         def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
                      max_seq_len, causal):
+            cache_dtype = self.cache_config.cache_dtype
+            if cache_dtype.startswith("fp8"):
+                qkv_dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn(
+                    cache_dtype)
+            else:
+                qkv_dtype = self.kv_cache_dtype
             if aot_schedule:
                 return get_scheduler_metadata(
                     batch_size=batch_size,
                     max_seqlen_q=max_query_len,
                     max_seqlen_k=max_seq_len,
-                    cache_seqlens=seqlens,
                     num_heads_q=self.num_heads_q,
                     num_heads_kv=self.num_heads_kv,
                     headdim=self.headdim,
-                    page_size=self.block_size,
+                    cache_seqlens=seqlens,
+                    qkv_dtype=qkv_dtype,
                     cu_seqlens_q=cu_query_lens,
+                    page_size=self.block_size,
                     causal=causal,
                     window_size=self.aot_sliding_window,
                     num_splits=self.max_num_splits,
@@ -474,8 +489,10 @@ class FlashAttentionImpl(AttentionImpl):
             )
 
         if self.kv_cache_dtype.startswith("fp8"):
-            key_cache = key_cache.view(torch.float8_e4m3fn)
-            value_cache = value_cache.view(torch.float8_e4m3fn)
+            dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn(
+                self.kv_cache_dtype)
+            key_cache = key_cache.view(dtype)
+            value_cache = value_cache.view(dtype)
             num_tokens, num_heads, head_size = query.shape
             query, _ = ops.scaled_fp8_quant(
                 query.reshape(

From 0f46a780d4f53b8564a37370f9f068cdf4e69604 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Fri, 1 Aug 2025 01:45:15 -0400
Subject: [PATCH 08/54] [Model] [Quantization] Support quantization for Gemma3n
 (#21974)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 vllm/model_executor/models/gemma3n.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index a58b32793dbef..e16c03c8d3b57 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -46,6 +46,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
+from .interfaces import SupportsQuant
 from .utils import (AutoWeightsLoader, extract_layer_index,
                     is_pp_missing_parameter, make_layers, maybe_prefix)
 
@@ -68,6 +69,7 @@ class Gemma3nAltUp(nn.Module):
         altup_num_inputs: int,
         altup_coef_clip: float,
         altup_active_idx: int,
+        quant_config: QuantizationConfig,
         prefix: str,
     ):
         super().__init__()
@@ -80,6 +82,7 @@ class Gemma3nAltUp(nn.Module):
             altup_num_inputs,
             altup_num_inputs,
             bias=False,
+            quant_config=quant_config,
             prefix=f"{prefix}.correction_coefs",
             return_bias=False,
         )
@@ -87,6 +90,7 @@ class Gemma3nAltUp(nn.Module):
             altup_num_inputs,
             altup_num_inputs**2,
             bias=False,
+            quant_config=quant_config,
             prefix=f"{prefix}.prediction_coefs",
             return_bias=False,
         )
@@ -94,6 +98,7 @@ class Gemma3nAltUp(nn.Module):
             hidden_size,
             altup_num_inputs,
             bias=False,
+            quant_config=quant_config,
             prefix=f"{prefix}.modality_router",
             return_bias=False,
         )
@@ -400,6 +405,7 @@ class Gemma3nDecoderLayer(nn.Module):
             altup_num_inputs=config.altup_num_inputs,
             altup_coef_clip=config.altup_coef_clip,
             altup_active_idx=config.altup_active_idx,
+            quant_config=quant_config,
             prefix=f"{prefix}.altup",
         )
         self.self_attn = Gemma3nAttention(
@@ -527,7 +533,7 @@ class Gemma3nDecoderLayer(nn.Module):
 
 
 @support_torch_compile
-class Gemma3nTextModel(nn.Module):
+class Gemma3nTextModel(nn.Module, SupportsQuant):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -540,6 +546,7 @@ class Gemma3nTextModel(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
             prefix=f"{prefix}.embed_tokens",
         )
         self.embed_scale = torch.tensor(
@@ -549,6 +556,7 @@ class Gemma3nTextModel(nn.Module):
         self.embed_tokens_per_layer = VocabParallelEmbedding(
             config.vocab_size_per_layer_input,
             config.num_hidden_layers * config.hidden_size_per_layer_input,
+            quant_config=quant_config,
             prefix=f"{prefix}.per_layer_embed_tokens",
         )
         self.embed_scale_per_layer = torch.tensor(
@@ -582,7 +590,7 @@ class Gemma3nTextModel(nn.Module):
                 gather_output=True,
                 return_bias=False,
                 quant_config=quant_config,
-                prefix=f"{prefix}.{idx-1}.altup_projections",
+                prefix=f"{prefix}.altup_projections.{idx-1}",
             ) for idx in range(1, self.config.altup_num_inputs)
         ])
         self.altup_unembed_projections = nn.ModuleList([
@@ -593,7 +601,7 @@ class Gemma3nTextModel(nn.Module):
                 gather_output=True,
                 return_bias=False,
                 quant_config=quant_config,
-                prefix=f"{prefix}.{idx-1}.altup_unembed_projections",
+                prefix=f"{prefix}.altup_unembed_projections.{idx-1}",
             ) for idx in range(1, self.config.altup_num_inputs)
         ])
 
@@ -774,7 +782,7 @@ class Gemma3nModel(nn.Module):
                                    **kwargs)
 
 
-class Gemma3nForConditionalGeneration(nn.Module):
+class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

From 61dcc280faf305778c0c44597e823f40063aaed6 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 1 Aug 2025 14:10:56 +0800
Subject: [PATCH 09/54] [Doc] Add Voxtral to Supported Models page (#22059)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/models/supported_models.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index f5d9e3b22f2a6..56c77a1e5f118 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -713,6 +713,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
 |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
 | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | |
+| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ |
 
 ### Pooling Models
 

From 53d7c39271aeb0568afcae337396a972e1848586 Mon Sep 17 00:00:00 2001
From: Aviad Rossmann <aviadr@neureality.ai>
Date: Fri, 1 Aug 2025 09:23:18 +0300
Subject: [PATCH 10/54] Update sampling_metadata.py (#21937)

Signed-off-by: Aviad Rossmann <aviadr@neureality.ai>
---
 vllm/model_executor/sampling_metadata.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 56f0f0984bfa0..66bcf1c4bfe50 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -539,37 +539,37 @@ class SamplingTensors:
         temperatures_t = torch.tensor(
             temperatures,
             device="cpu",
-            dtype=dtype,
+            dtype=torch.float32,
             pin_memory=pin_memory,
         )
         top_ps_t = torch.tensor(
             top_ps,
             device="cpu",
-            dtype=dtype,
+            dtype=torch.float32,
             pin_memory=pin_memory,
         )
         min_ps_t = torch.tensor(
             min_ps,
             device="cpu",
-            dtype=dtype,
+            dtype=torch.float32,
             pin_memory=pin_memory,
         )
         presence_penalties_t = torch.tensor(
             presence_penalties,
             device="cpu",
-            dtype=dtype,
+            dtype=torch.float32,
             pin_memory=pin_memory,
         )
         frequency_penalties_t = torch.tensor(
             frequency_penalties,
             device="cpu",
-            dtype=dtype,
+            dtype=torch.float32,
             pin_memory=pin_memory,
         )
         repetition_penalties_t = torch.tensor(
             repetition_penalties,
             device="cpu",
-            dtype=dtype,
+            dtype=torch.float32,
             pin_memory=pin_memory,
         )
         top_ks_t = torch.tensor(

From 79731a79f09dc7bbe34dc8afbe8ef2242fb94a05 Mon Sep 17 00:00:00 2001
From: Hongsheng Liu <liuhongsheng4@huawei.com>
Date: Fri, 1 Aug 2025 15:01:22 +0800
Subject: [PATCH 11/54] [Doc] Fix a syntax error of example code in
 structured_outputs.md (#22045)

Signed-off-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: wangzi <3220100013@zju.edu.cn>
---
 docs/features/structured_outputs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md
index 4f737afa80f55..8a934d406f382 100644
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -103,7 +103,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
                 "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
             }
         ],
-        "response_format": {
+        response_format={
             "type": "json_schema",
             "json_schema": {
                 "name": "car-description",

From b4e081cb150797b12039cc1232205dbb25ca0206 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Fri, 1 Aug 2025 15:03:56 +0800
Subject: [PATCH 12/54] [Bugfix] Disable multi-modal preprocessor cache for DP
 (#21896)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/config.py                |  6 ++++++
 vllm/engine/arg_utils.py      | 12 ++++++++++++
 vllm/entrypoints/cli/serve.py |  5 +++--
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 9d5739ca11efd..93daab7d6ae97 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -871,6 +871,12 @@ class ModelConfig:
 
         return None
 
+    def set_disable_mm_preprocessor_cache(self, value: bool) -> None:
+        mm_config = self.get_multimodal_config()
+
+        self.disable_mm_preprocessor_cache = value
+        mm_config.disable_mm_preprocessor_cache = value
+
     def _get_encoder_config(self):
         return get_sentence_transformer_tokenizer_config(
             self.model, self.revision)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c36c79c69317e..78272d983eaf5 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1197,6 +1197,18 @@ class EngineArgs:
             enable_multimodal_encoder_data_parallel,
         )
 
+        supports_mm_preprocessor_cache = (self.data_parallel_size == 1
+                                          or data_parallel_external_lb)
+        if (not supports_mm_preprocessor_cache
+                and model_config.is_multimodal_model
+                and not model_config.disable_mm_preprocessor_cache):
+            logger.warning(
+                "Multi-modal preprocessor cache is not compatible "
+                "with data parallelism when there does not exist a "
+                "one-to-one correspondance between API process and "
+                "EngineCore process, so the cache will be disabled.")
+            model_config.set_disable_mm_preprocessor_cache(True)
+
         speculative_config = self.create_speculative_config(
             target_model_config=model_config,
             target_parallel_config=parallel_config,
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 7dcba2cccdb52..bdbe71b832f4f 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -167,8 +167,9 @@ def run_multi_api_server(args: argparse.Namespace):
 
         if model_config.is_multimodal_model and not (
                 orig_disable_mm_preprocessor_cache):
-            logger.warning("Multi-model preprocessor cache will be disabled "
-                           "for api_server_count > 1")
+            logger.warning(
+                "Multi-modal preprocessor cache is not compatible "
+                "with api_server_count > 1, so the cache will be disabled.")
 
     executor_class = Executor.get_class(vllm_config)
     log_stats = not engine_args.disable_log_stats

From e0f63e4a3509a9323339eee67c96ac3c93d15923 Mon Sep 17 00:00:00 2001
From: Zebing Lin <linzebing1995@gmail.com>
Date: Fri, 1 Aug 2025 03:23:29 -0400
Subject: [PATCH 13/54] [Core] Avoid repeated len(block_token_ids) check in
 hash_request_tokens (#21781)

Signed-off-by: linzebing <linzebing1995@gmail.com>
---
 vllm/v1/core/kv_cache_utils.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 25520eb655111..eab1560b1a18c 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -567,12 +567,10 @@ def hash_request_tokens(hash_function: Any, block_size: int,
 
     ret = []
     parent_block_hash_value = None
-    for start in range(0, len(token_ids), block_size):
+    # Only full blocks will be hashed
+    for start in range(0, len(token_ids) - block_size + 1, block_size):
         end = start + block_size
         block_token_ids = token_ids[start:end]
-        # Do not hash the block if it is not full.
-        if len(block_token_ids) < block_size:
-            break
 
         if req_need_extra_keys:
             # MM and LoRA requests need extra keys for block-hash computation.

From 98df153abfcc443218aacfe61b3fd5abe2b88142 Mon Sep 17 00:00:00 2001
From: Sungyoon Jeong <157349761+n0gu-furiosa@users.noreply.github.com>
Date: Fri, 1 Aug 2025 16:54:17 +0900
Subject: [PATCH 14/54] [Frontend] Align tool_choice="required" behavior with
 OpenAI when tools is empty (#21052)

Signed-off-by: Sungyoon Jeong <sungyoon.jeong@furiosa.ai>
---
 vllm/entrypoints/openai/protocol.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index b6b3bf3f530e3..d77aee345843c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -859,6 +859,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
                     'are supported.'
                 )
 
+            # if tool_choice is "required" but the "tools" list is empty,
+            # override the data to behave like "none" to align with
+            # OpenAI’s behavior.
+            if data["tool_choice"] == "required" and isinstance(
+                    data["tools"], list) and len(data["tools"]) == 0:
+                data["tool_choice"] = "none"
+                del data["tools"]
+                return data
+
             # ensure that if "tool_choice" is specified as an object,
             # it matches a valid tool
             correct_usage_message = 'Correct usage: `{"type": "function",' \

From da31f6ad3dacea8579adfb36d64d28759dc5c095 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Fri, 1 Aug 2025 01:26:24 -0700
Subject: [PATCH 15/54] Revert precompile wheel changes (#22055)

---
 docker/Dockerfile     |  27 +++----
 requirements/test.txt |  24 ++----
 setup.py              | 182 ++++++++++++++++++++----------------------
 vllm/envs.py          |  11 +--
 4 files changed, 107 insertions(+), 137 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 413151b3edb00..0d6afca74e867 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -206,7 +206,16 @@ ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
 
 # Flag to control whether to use pre-built vLLM wheels
-ARG VLLM_USE_PRECOMPILED=""
+ARG VLLM_USE_PRECOMPILED
+# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
+ENV VLLM_USE_PRECOMPILED=""
+RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
+        export VLLM_USE_PRECOMPILED=1 && \
+        echo "Using precompiled wheels"; \
+    else \
+        unset VLLM_USE_PRECOMPILED && \
+        echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
+    fi
 
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -223,8 +232,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
         && export SCCACHE_IDLE_TIMEOUT=0 \
         && export CMAKE_BUILD_TYPE=Release \
-        && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
-        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
         && sccache --show-stats \
         && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
         && sccache --show-stats; \
@@ -238,22 +245,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         # Clean any existing CMake artifacts
         rm -rf .deps && \
         mkdir -p .deps && \
-        export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
-        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
-# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
-RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \
-        echo "Cleaning up extra wheels in dist/..." && \
-        # Identify the most recent manylinux1_x86_64 wheel
-        KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
-        if [ -n "$KEEP_WHEEL" ]; then \
-            echo "Keeping wheel: $KEEP_WHEEL"; \
-            find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
-        fi; \
-    fi
-
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
@@ -369,7 +363,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     fi
 
 # Install vllm wheel first, so that torch etc will be installed.
-# !bang
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system dist/*.whl --verbose \
diff --git a/requirements/test.txt b/requirements/test.txt
index 4aaca2afea266..d45048aae5809 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -22,7 +22,9 @@ aiohttp==3.10.11
 aiohttp-cors==0.8.1
     # via ray
 aiosignal==1.3.1
-    # via aiohttp
+    # via
+    #   aiohttp
+    #   ray
 albucore==0.0.16
     # via terratorch
 albumentations==1.4.6
@@ -137,7 +139,7 @@ contourpy==1.3.0
     # via matplotlib
 cramjam==2.9.0
     # via fastparquet
-cupy-cuda12x==13.5.1
+cupy-cuda12x==13.3.0
     # via ray
 cycler==0.12.1
     # via matplotlib
@@ -224,6 +226,7 @@ frozenlist==1.5.0
     # via
     #   aiohttp
     #   aiosignal
+    #   ray
 fsspec==2024.9.0
     # via
     #   datasets
@@ -600,18 +603,10 @@ opencv-python-headless==4.11.0.86
 opentelemetry-api==1.35.0
     # via
     #   mlflow-skinny
-    #   opentelemetry-exporter-prometheus
     #   opentelemetry-sdk
     #   opentelemetry-semantic-conventions
-opentelemetry-exporter-prometheus==0.56b0
-    # via ray
-opentelemetry-proto==1.36.0
-    # via ray
 opentelemetry-sdk==1.35.0
-    # via
-    #   mlflow-skinny
-    #   opentelemetry-exporter-prometheus
-    #   ray
+    # via mlflow-skinny
 opentelemetry-semantic-conventions==0.56b0
     # via opentelemetry-sdk
 packaging==24.2
@@ -702,9 +697,7 @@ pqdm==0.2.0
 pretrainedmodels==0.7.4
     # via segmentation-models-pytorch
 prometheus-client==0.22.0
-    # via
-    #   opentelemetry-exporter-prometheus
-    #   ray
+    # via ray
 propcache==0.2.0
     # via yarl
 proto-plus==1.26.1
@@ -714,7 +707,6 @@ protobuf==5.28.3
     #   google-api-core
     #   googleapis-common-protos
     #   mlflow-skinny
-    #   opentelemetry-proto
     #   proto-plus
     #   ray
     #   tensorboardx
@@ -862,7 +854,7 @@ rasterio==1.4.3
     #   rioxarray
     #   terratorch
     #   torchgeo
-ray==2.48.0
+ray==2.43.0
     # via -r requirements/test.in
 redis==5.2.0
     # via tensorizer
diff --git a/setup.py b/setup.py
index bfa195d4395f0..64cfbb8db962b 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,6 @@ import json
 import logging
 import os
 import re
-import shutil
 import subprocess
 import sys
 from pathlib import Path
@@ -282,69 +281,10 @@ class cmake_build_ext(build_ext):
             self.copy_file(file, dst_file)
 
 
-class precompiled_wheel_utils:
+class repackage_wheel(build_ext):
     """Extracts libraries and other files from an existing wheel."""
 
-    @staticmethod
-    def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
-        import tempfile
-        import zipfile
-
-        temp_dir = None
-        try:
-            if not os.path.isfile(wheel_url_or_path):
-                wheel_filename = wheel_url_or_path.split("/")[-1]
-                temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
-                wheel_path = os.path.join(temp_dir, wheel_filename)
-                print(f"Downloading wheel from {wheel_url_or_path} "
-                      f"to {wheel_path}")
-                from urllib.request import urlretrieve
-                urlretrieve(wheel_url_or_path, filename=wheel_path)
-            else:
-                wheel_path = wheel_url_or_path
-                print(f"Using existing wheel at {wheel_path}")
-
-            package_data_patch = {}
-
-            with zipfile.ZipFile(wheel_path) as wheel:
-                files_to_copy = [
-                    "vllm/_C.abi3.so",
-                    "vllm/_moe_C.abi3.so",
-                    "vllm/_flashmla_C.abi3.so",
-                    "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
-                    "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
-                    "vllm/cumem_allocator.abi3.so",
-                ]
-
-                compiled_regex = re.compile(
-                    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
-                file_members = list(
-                    filter(lambda x: x.filename in files_to_copy,
-                           wheel.filelist))
-                file_members += list(
-                    filter(lambda x: compiled_regex.match(x.filename),
-                           wheel.filelist))
-
-                for file in file_members:
-                    print(f"[extract] {file.filename}")
-                    target_path = os.path.join(".", file.filename)
-                    os.makedirs(os.path.dirname(target_path), exist_ok=True)
-                    with wheel.open(file.filename) as src, open(
-                            target_path, "wb") as dst:
-                        shutil.copyfileobj(src, dst)
-
-                    pkg = os.path.dirname(file.filename).replace("/", ".")
-                    package_data_patch.setdefault(pkg, []).append(
-                        os.path.basename(file.filename))
-
-            return package_data_patch
-        finally:
-            if temp_dir is not None:
-                print(f"Removing temporary directory {temp_dir}")
-                shutil.rmtree(temp_dir)
-
-    @staticmethod
-    def get_base_commit_in_main_branch() -> str:
+    def get_base_commit_in_main_branch(self) -> str:
         # Force to use the nightly wheel. This is mainly used for CI testing.
         if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
             return "nightly"
@@ -357,10 +297,6 @@ class precompiled_wheel_utils:
             ]).decode("utf-8")
             upstream_main_commit = json.loads(resp_json)["sha"]
 
-            # In Docker build context, .git may be immutable or missing.
-            if envs.VLLM_DOCKER_BUILD_CONTEXT:
-                return upstream_main_commit
-
             # Check if the upstream_main_commit exists in the local repo
             try:
                 subprocess.check_output(
@@ -393,15 +329,92 @@ class precompiled_wheel_utils:
                 "wheel may not be compatible with your dev branch: %s", err)
             return "nightly"
 
+    def run(self) -> None:
+        assert _is_cuda(
+        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+
+        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
+        if wheel_location is None:
+            base_commit = self.get_base_commit_in_main_branch()
+            wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+            # Fallback to nightly wheel if latest commit wheel is unavailable,
+            # in this rare case, the nightly release CI hasn't finished on main.
+            if not is_url_available(wheel_location):
+                wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+
+        import zipfile
+
+        if os.path.isfile(wheel_location):
+            wheel_path = wheel_location
+            print(f"Using existing wheel={wheel_path}")
+        else:
+            # Download the wheel from a given URL, assume
+            # the filename is the last part of the URL
+            wheel_filename = wheel_location.split("/")[-1]
+
+            import tempfile
+
+            # create a temporary directory to store the wheel
+            temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
+            wheel_path = os.path.join(temp_dir, wheel_filename)
+
+            print(f"Downloading wheel from {wheel_location} to {wheel_path}")
+
+            from urllib.request import urlretrieve
+
+            try:
+                urlretrieve(wheel_location, filename=wheel_path)
+            except Exception as e:
+                from setuptools.errors import SetupError
+
+                raise SetupError(
+                    f"Failed to get vLLM wheel from {wheel_location}") from e
+
+        with zipfile.ZipFile(wheel_path) as wheel:
+            files_to_copy = [
+                "vllm/_C.abi3.so",
+                "vllm/_moe_C.abi3.so",
+                "vllm/_flashmla_C.abi3.so",
+                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
+                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
+                "vllm/cumem_allocator.abi3.so",
+                # "vllm/_version.py", # not available in nightly wheels yet
+            ]
+
+            file_members = list(
+                filter(lambda x: x.filename in files_to_copy, wheel.filelist))
+
+            # vllm_flash_attn python code:
+            # Regex from
+            #  `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
+            compiled_regex = re.compile(
+                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
+            file_members += list(
+                filter(lambda x: compiled_regex.match(x.filename),
+                       wheel.filelist))
+
+            for file in file_members:
+                print(f"Extracting and including {file.filename} "
+                      "from existing wheel")
+                package_name = os.path.dirname(file.filename).replace("/", ".")
+                file_name = os.path.basename(file.filename)
+
+                if package_name not in package_data:
+                    package_data[package_name] = []
+
+                wheel.extract(file)
+                if file_name.endswith(".py"):
+                    # python files shouldn't be added to package_data
+                    continue
+
+                package_data[package_name].append(file_name)
+
 
 def _no_device() -> bool:
     return VLLM_TARGET_DEVICE == "empty"
 
 
 def _is_cuda() -> bool:
-    # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda
-    if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT:
-        return True
     has_cuda = torch.version.cuda is not None
     return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
             and not (_is_neuron() or _is_tpu()))
@@ -626,37 +639,16 @@ package_data = {
     ]
 }
 
-# If using precompiled, extract and patch package_data (in advance of setup)
-if envs.VLLM_USE_PRECOMPILED:
-    assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
-    wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
-    if wheel_location is not None:
-        wheel_url = wheel_location
-    else:
-        base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
-        wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-        from urllib.request import urlopen
-        try:
-            with urlopen(wheel_url) as resp:
-                if resp.status != 200:
-                    wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-        except Exception as e:
-            print(f"[warn] Falling back to nightly wheel: {e}")
-            wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-
-    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
-        wheel_url)
-    for pkg, files in patch.items():
-        package_data.setdefault(pkg, []).extend(files)
-
 if _no_device():
     ext_modules = []
 
-if not ext_modules or envs.VLLM_USE_PRECOMPILED:
-    # Disable build_ext when using precompiled wheel
+if not ext_modules:
     cmdclass = {}
 else:
-    cmdclass = {"build_ext": cmake_build_ext}
+    cmdclass = {
+        "build_ext":
+        repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
+    }
 
 setup(
     # static metadata should rather go in pyproject.toml
diff --git a/vllm/envs.py b/vllm/envs.py
index 19bc9156b2586..7553eccf16ea9 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -68,7 +68,6 @@ if TYPE_CHECKING:
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
     VLLM_USE_PRECOMPILED: bool = False
-    VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_NO_DEPRECATION_WARNING: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -228,14 +227,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # If set, vllm will use precompiled binaries (*.so)
     "VLLM_USE_PRECOMPILED":
-    lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in
-    ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
-
-    # Used to mark that setup.py is running in a Docker build context,
-    # in order to force the use of precompiled binaries.
-    "VLLM_DOCKER_BUILD_CONTEXT":
-    lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in
-    ("1", "true"),
+    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
+        os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
 
     # Whether to force using nightly wheel in python build.
     # This is used for testing the nightly wheel in python build.

From 27a145e8931582fc74c1f46e0e4630c610b96160 Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.me>
Date: Fri, 1 Aug 2025 01:35:49 -0700
Subject: [PATCH 16/54] [Doc] Add example for Step3-VL (#22061)

Signed-off-by: Roger Wang <hey@rogerw.me>
---
 examples/offline_inference/vision_language.py | 298 ++++++++++--------
 .../vision_language_multi_image.py            | 215 +++++++------
 2 files changed, 286 insertions(+), 227 deletions(-)

diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 0edcd0407747c..a75b8e2b047d8 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -423,32 +423,6 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# SmolVLM2-2.2B-Instruct
-def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        max_model_len=8192,
-        max_num_seqs=2,
-        enforce_eager=True,
-        mm_processor_kwargs={
-            "max_image_size": {"longest_edge": 384},
-        },
-        limit_mm_per_prompt={modality: 1},
-    )
-    prompts = [
-        (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
-        for question in questions
-    ]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-    )
-
-
 # Intern-S1
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "internlm/Intern-S1"
@@ -522,44 +496,6 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# Nemontron_VL
-def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
-    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=8192,
-        limit_mm_per_prompt={modality: 1},
-    )
-
-    assert modality == "image"
-    placeholder = "<image>"
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    messages = [
-        [{"role": "user", "content": f"{placeholder}\n{question}"}]
-        for question in questions
-    ]
-    prompts = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    # Stop tokens for InternVL
-    # models variants may have different stop tokens
-    # please refer to the model card for the correct "stop words":
-    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
-    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
-    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-        stop_token_ids=stop_token_ids,
-    )
-
-
 # Keye-VL
 def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "Kwai-Keye/Keye-VL-8B-Preview"
@@ -615,6 +551,41 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=4,
+        tensor_parallel_size=8,
+        gpu_memory_utilization=0.4,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    messages = [
+        [
+            {
+                "role": "user",
+                "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}],
+            }
+        ]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, add_generation_prompt=True, tokenize=False
+    )
+    stop_token_ids = None
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+
 # LLaVA-1.5
 def run_llava(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -857,41 +828,6 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-
-    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        max_model_len=8192,
-        max_num_seqs=4,
-        tensor_parallel_size=8,
-        gpu_memory_utilization=0.4,
-        limit_mm_per_prompt={modality: 1},
-    )
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    messages = [
-        [
-            {
-                "role": "user",
-                "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}],
-            }
-        ]
-        for question in questions
-    ]
-    prompts = tokenizer.apply_chat_template(
-        messages, add_generation_prompt=True, tokenize=False
-    )
-    stop_token_ids = None
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-        stop_token_ids=stop_token_ids,
-    )
-
-
 # Molmo
 def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -917,6 +853,44 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# Nemontron_VL
+def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    assert modality == "image"
+    placeholder = "<image>"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"{placeholder}\n{question}"}]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    # Stop tokens for InternVL
+    # models variants may have different stop tokens
+    # please refer to the model card for the correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+
 # NVLM-D
 def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1274,6 +1248,94 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
     )
 
 
+# SkyworkR1V
+def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "Skywork/Skywork-R1V-38B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    # Stop tokens for SkyworkR1V
+    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
+    stop_tokens = ["<｜end▁of▁sentence｜>", "<|endoftext|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+
+# SmolVLM2-2.2B-Instruct
+def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        enforce_eager=True,
+        mm_processor_kwargs={
+            "max_image_size": {"longest_edge": 384},
+        },
+        limit_mm_per_prompt={modality: 1},
+    )
+    prompts = [
+        (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+# Step3
+def run_step3(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "stepfun-ai/step3-fp8"
+
+    # NOTE: Below are verified configurations for step3-fp8
+    # on 8xH100 GPUs.
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_batched_tokens=4096,
+        gpu_memory_utilization=0.85,
+        tensor_parallel_size=8,
+        limit_mm_per_prompt={modality: 1},
+        reasoning_parser="step3",
+    )
+
+    prompts = [
+        "<｜begin▁of▁sentence｜> You are a helpful assistant. <|BOT|>user\n "
+        f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # omni-research/Tarsier-7b
 def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1324,39 +1386,6 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# SkyworkR1V
-def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-
-    model_name = "Skywork/Skywork-R1V-38B"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        trust_remote_code=True,
-        max_model_len=4096,
-        limit_mm_per_prompt={modality: 1},
-    )
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    messages = [
-        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
-    ]
-    prompts = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    # Stop tokens for SkyworkR1V
-    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
-    stop_tokens = ["<｜end▁of▁sentence｜>", "<|endoftext|>"]
-    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-        stop_token_ids=stop_token_ids,
-    )
-
-
 model_example_map = {
     "aria": run_aria,
     "aya_vision": run_aya_vision,
@@ -1373,9 +1402,9 @@ model_example_map = {
     "idefics3": run_idefics3,
     "interns1": run_interns1,
     "internvl_chat": run_internvl,
-    "nemotron_vl": run_nemotron_vl,
     "keye_vl": run_keye_vl,
     "kimi_vl": run_kimi_vl,
+    "llama4": run_llama4,
     "llava": run_llava,
     "llava-next": run_llava_next,
     "llava-next-video": run_llava_next_video,
@@ -1385,8 +1414,8 @@ model_example_map = {
     "minicpmv": run_minicpmv,
     "mistral3": run_mistral3,
     "mllama": run_mllama,
-    "llama4": run_llama4,
     "molmo": run_molmo,
+    "nemotron_vl": run_nemotron_vl,
     "NVLM_D": run_nvlm_d,
     "ovis": run_ovis,
     "paligemma": run_paligemma,
@@ -1401,6 +1430,7 @@ model_example_map = {
     "qwen2_5_omni": run_qwen2_5_omni,
     "skywork_chat": run_skyworkr1v,
     "smolvlm": run_smolvlm,
+    "step3": run_step3,
     "tarsier": run_tarsier,
     "tarsier2": run_tarsier2,
 }
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index dd50f3639709e..1ab405fa14f3a 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -197,6 +197,53 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_hyperclovax_seed_vision(
+    question: str, image_urls: list[str]
+) -> ModelRequestData:
+    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=16384,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    message = {"role": "user", "content": list()}
+    for _image_url in image_urls:
+        message["content"].append(
+            {
+                "type": "image",
+                "image": _image_url,
+                "ocr": "",
+                "lens_keywords": "",
+                "lens_local_keywords": "",
+            }
+        )
+    message["content"].append(
+        {
+            "type": "text",
+            "text": question,
+        }
+    )
+
+    prompt = tokenizer.apply_chat_template(
+        [
+            message,
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        stop_token_ids=None,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
 
@@ -225,34 +272,6 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
-def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-
-    # The configuration below has been confirmed to launch on a single L40 GPU.
-    engine_args = EngineArgs(
-        model=model_name,
-        max_model_len=8192,
-        max_num_seqs=16,
-        enforce_eager=True,
-        limit_mm_per_prompt={"image": len(image_urls)},
-        mm_processor_kwargs={
-            "max_image_size": {"longest_edge": 384},
-        },
-    )
-
-    placeholders = "\n".join(
-        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
-    )
-    prompt = (
-        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
-    )
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
-    )
-
-
 def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "internlm/Intern-S1"
 
@@ -316,49 +335,36 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
-def load_hyperclovax_seed_vision(
-    question: str, image_urls: list[str]
-) -> ModelRequestData:
-    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 
     engine_args = EngineArgs(
         model=model_name,
-        trust_remote_code=True,
-        max_model_len=16384,
+        max_model_len=131072,
+        tensor_parallel_size=8,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
-    message = {"role": "user", "content": list()}
-    for _image_url in image_urls:
-        message["content"].append(
-            {
-                "type": "image",
-                "image": _image_url,
-                "ocr": "",
-                "lens_keywords": "",
-                "lens_local_keywords": "",
-            }
-        )
-    message["content"].append(
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
         {
-            "type": "text",
-            "text": question,
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
         }
-    )
+    ]
 
-    prompt = tokenizer.apply_chat_template(
-        [
-            message,
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
     )
 
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        stop_token_ids=None,
         image_data=[fetch_image(url) for url in image_urls],
     )
 
@@ -463,40 +469,6 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa
     )
 
 
-def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-
-    engine_args = EngineArgs(
-        model=model_name,
-        max_model_len=131072,
-        tensor_parallel_size=8,
-        limit_mm_per_prompt={"image": len(image_urls)},
-    )
-
-    placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                *placeholders,
-                {"type": "text", "text": question},
-            ],
-        }
-    ]
-
-    processor = AutoProcessor.from_pretrained(model_name)
-
-    prompt = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
-    )
-
-
 def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "Kwai-Keye/Keye-VL-8B-Preview"
 
@@ -954,6 +926,62 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+
+    # The configuration below has been confirmed to launch on a single L40 GPU.
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=16,
+        enforce_eager=True,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={
+            "max_image_size": {"longest_edge": 384},
+        },
+    )
+
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    prompt = (
+        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
+    )
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "stepfun-ai/step3-fp8"
+
+    # NOTE: Below are verified configurations for step3-fp8
+    # on 8xH100 GPUs.
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_batched_tokens=4096,
+        gpu_memory_utilization=0.85,
+        tensor_parallel_size=8,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        reasoning_parser="step3",
+    )
+
+    prompt = (
+        "<｜begin▁of▁sentence｜> You are a helpful assistant. <|BOT|>user\n "
+        f"{'<im_patch>' * len(image_urls)}{question} <|EOT|><|BOT|"
+        ">assistant\n<think>\n"
+    )
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "omni-research/Tarsier-7b"
 
@@ -1006,16 +1034,16 @@ model_example_map = {
     "deepseek_vl_v2": load_deepseek_vl2,
     "gemma3": load_gemma3,
     "h2ovl_chat": load_h2ovl,
+    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
     "idefics3": load_idefics3,
     "interns1": load_interns1,
     "internvl_chat": load_internvl,
-    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
     "keye_vl": load_keye_vl,
     "kimi_vl": load_kimi_vl,
+    "llama4": load_llama4,
     "llava": load_llava,
     "llava-next": load_llava_next,
     "llava-onevision": load_llava_onevision,
-    "llama4": load_llama4,
     "mistral3": load_mistral3,
     "mllama": load_mllama,
     "NVLM_D": load_nvlm_d,
@@ -1028,6 +1056,7 @@ model_example_map = {
     "qwen2_vl": load_qwen2_vl,
     "qwen2_5_vl": load_qwen2_5_vl,
     "smolvlm": load_smolvlm,
+    "step3": load_step3,
     "tarsier": load_tarsier,
     "tarsier2": load_tarsier2,
 }

From e6680f9e25a433bcd754181705e72034ce6c470c Mon Sep 17 00:00:00 2001
From: wuhang <wuhang6@huawei.com>
Date: Fri, 1 Aug 2025 17:04:16 +0800
Subject: [PATCH 17/54] [Bugfix] Add log prefix in non-dp mode engine core
 (#21889)

Signed-off-by: wuhang <wuhang6@huawei.com>
---
 vllm/entrypoints/cli/serve.py           | 11 +----
 vllm/entrypoints/openai/api_server.py   | 12 ++----
 vllm/executor/multiproc_worker_utils.py | 42 ++-----------------
 vllm/utils/__init__.py                  | 55 ++++++++++++++++++++++++-
 vllm/v1/engine/core.py                  | 22 ++--------
 vllm/v1/executor/multiproc_executor.py  | 14 +++----
 6 files changed, 75 insertions(+), 81 deletions(-)

diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index bdbe71b832f4f..0305354a66e85 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -2,9 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
-import os
 import signal
-import sys
 from typing import Optional
 
 import uvloop
@@ -18,10 +16,9 @@ from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                               validate_parsed_serve_args)
 from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG,
                                     show_filtered_argument_or_group_from_help)
-from vllm.executor.multiproc_worker_utils import _add_prefix
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, get_tcp_uri
+from vllm.utils import FlexibleArgumentParser, decorate_logs, get_tcp_uri
 from vllm.v1.engine.core import EngineCoreProc
 from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
 from vllm.v1.executor.abstract import Executor
@@ -229,11 +226,7 @@ def run_api_server_worker_proc(listen_address,
     """Entrypoint for individual API server worker processes."""
 
     # Add process-specific prefix to stdout and stderr.
-    from multiprocessing import current_process
-    process_name = current_process().name
-    pid = os.getpid()
-    _add_prefix(sys.stdout, process_name, pid)
-    _add_prefix(sys.stderr, process_name, pid)
+    decorate_logs()
 
     uvloop.run(
         run_server_worker(listen_address, sock, args, client_config,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 05d9a69a65f83..26db1357da4d0 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -11,7 +11,6 @@ import multiprocessing
 import os
 import signal
 import socket
-import sys
 import tempfile
 import uuid
 from argparse import Namespace
@@ -95,15 +94,15 @@ from vllm.entrypoints.openai.serving_transcription import (
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
 from vllm.entrypoints.utils import (cli_env_setup, load_aware_call,
                                     log_non_default_args, with_cancellation)
-from vllm.executor.multiproc_worker_utils import _add_prefix
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.tokenizer import MistralTokenizer
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
-                        is_valid_ipv6_address, set_process_title, set_ulimit)
+from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs,
+                        get_open_zmq_ipc_path, is_valid_ipv6_address,
+                        set_process_title, set_ulimit)
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
 
@@ -1808,10 +1807,7 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     """Run a single-worker API server."""
 
     # Add process-specific prefix to stdout and stderr.
-    process_name = "APIServer"
-    pid = os.getpid()
-    _add_prefix(sys.stdout, process_name, pid)
-    _add_prefix(sys.stderr, process_name, pid)
+    decorate_logs("APIServer")
 
     listen_address, sock = setup_server(args)
     await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index a6c172beff7bb..48b3479ed7997 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -3,21 +3,20 @@
 
 import asyncio
 import os
-import sys
 import threading
 import uuid
 from dataclasses import dataclass
 from multiprocessing import Queue
 from multiprocessing.connection import wait
 from multiprocessing.process import BaseProcess
-from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
-                    TypeVar, Union)
+from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union
 
 import torch
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import _maybe_force_spawn, get_mp_context, run_method
+from vllm.utils import (_maybe_force_spawn, decorate_logs, get_mp_context,
+                        run_method)
 
 logger = init_logger(__name__)
 
@@ -25,10 +24,6 @@ T = TypeVar('T')
 
 _TERMINATE = "TERMINATE"  # sentinel
 
-# ANSI color codes
-CYAN = '\033[1;36m'
-RESET = '\033[0;0m'
-
 JOIN_TIMEOUT_S = 2
 
 
@@ -213,9 +208,7 @@ def _run_worker_process(
 
     # Add process-specific prefix to stdout and stderr
     process_name = get_mp_context().current_process().name
-    pid = os.getpid()
-    _add_prefix(sys.stdout, process_name, pid)
-    _add_prefix(sys.stderr, process_name, pid)
+    decorate_logs(process_name)
 
     # Initialize worker
     worker = worker_factory(vllm_config, rank)
@@ -260,33 +253,6 @@ def _run_worker_process(
     logger.info("Worker exiting")
 
 
-def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
-    """Prepend each output line with process-specific prefix"""
-
-    prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
-    file_write = file.write
-
-    def write_with_prefix(s: str):
-        if not s:
-            return
-        if file.start_new_line:  # type: ignore[attr-defined]
-            file_write(prefix)
-        idx = 0
-        while (next_idx := s.find('\n', idx)) != -1:
-            next_idx += 1
-            file_write(s[idx:next_idx])
-            if next_idx == len(s):
-                file.start_new_line = True  # type: ignore[attr-defined]
-                return
-            file_write(prefix)
-            idx = next_idx
-        file_write(s[idx:])
-        file.start_new_line = False  # type: ignore[attr-defined]
-
-    file.start_new_line = True  # type: ignore[attr-defined]
-    file.write = write_with_prefix  # type: ignore[method-assign]
-
-
 def set_multiprocessing_worker_envs(parallel_config):
     """ Set up environment variables that should be used when there are workers
     in a multiprocessing environment. This should be called by the parent 
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index a7f579b0c9c2d..d5d8d9dad73a8 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -47,7 +47,7 @@ from dataclasses import dataclass, field
 from functools import cache, lru_cache, partial, wraps
 from types import MappingProxyType
 from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple,
-                    Optional, Tuple, TypeVar, Union, cast, overload)
+                    Optional, TextIO, Tuple, TypeVar, Union, cast, overload)
 from urllib.parse import urlparse
 from uuid import uuid4
 
@@ -167,6 +167,10 @@ GB_bytes = 1_000_000_000
 GiB_bytes = 1 << 30
 """The number of bytes in one gibibyte (GiB)."""
 
+# ANSI color codes
+CYAN = '\033[1;36m'
+RESET = '\033[0;0m'
+
 STR_DTYPE_TO_TORCH_DTYPE = {
     "half": torch.half,
     "bfloat16": torch.bfloat16,
@@ -3258,3 +3262,52 @@ def set_process_title(name: str,
     else:
         name = f"{envs.VLLM_PROCESS_NAME_PREFIX}::{name}"
     setproctitle.setproctitle(name)
+
+
+def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
+    """Prepend each output line with process-specific prefix"""
+
+    prefix = f"{CYAN}({worker_name} pid={pid}){RESET} "
+    file_write = file.write
+
+    def write_with_prefix(s: str):
+        if not s:
+            return
+        if file.start_new_line:  # type: ignore[attr-defined]
+            file_write(prefix)
+        idx = 0
+        while (next_idx := s.find('\n', idx)) != -1:
+            next_idx += 1
+            file_write(s[idx:next_idx])
+            if next_idx == len(s):
+                file.start_new_line = True  # type: ignore[attr-defined]
+                return
+            file_write(prefix)
+            idx = next_idx
+        file_write(s[idx:])
+        file.start_new_line = False  # type: ignore[attr-defined]
+
+    file.start_new_line = True  # type: ignore[attr-defined]
+    file.write = write_with_prefix  # type: ignore[method-assign]
+
+
+def decorate_logs(process_name: Optional[str] = None) -> None:
+    """
+    Adds a process-specific prefix to each line of output written to stdout and
+    stderr.
+
+    This function is intended to be called before initializing the api_server,
+    engine_core, or worker classes, so that all subsequent output from the
+    process is prefixed with the process name and PID. This helps distinguish
+    log output from different processes in multi-process environments.
+
+    Args:
+        process_name: Optional; the name of the process to use in the prefix.
+            If not provided, the current process name from the multiprocessing
+            context is used.
+    """
+    if process_name is None:
+        process_name = get_mp_context().current_process().name
+    pid = os.getpid()
+    _add_prefix(sys.stdout, process_name, pid)
+    _add_prefix(sys.stderr, process_name, pid)
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index f9a6315df8af8..6ae5736df98b8 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -3,7 +3,6 @@
 import os
 import queue
 import signal
-import sys
 import threading
 import time
 from collections import deque
@@ -19,15 +18,14 @@ import zmq
 
 from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
-from vllm.executor.multiproc_worker_utils import _add_prefix
 from vllm.logger import init_logger
 from vllm.logging_utils.dump_input import dump_engine_exception
 from vllm.lora.request import LoRARequest
 from vllm.tasks import POOLING_TASKS, SupportedTask
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
-from vllm.utils import (make_zmq_socket, resolve_obj_by_qualname,
-                        set_process_title)
+from vllm.utils import (decorate_logs, make_zmq_socket,
+                        resolve_obj_by_qualname, set_process_title)
 from vllm.v1.core.kv_cache_utils import (get_kv_cache_config,
                                          unify_kv_cache_configs)
 from vllm.v1.core.sched.interface import SchedulerInterface
@@ -649,12 +647,14 @@ class EngineCoreProc(EngineCore):
                 "vllm_config"].parallel_config
             if parallel_config.data_parallel_size > 1 or dp_rank > 0:
                 set_process_title("DPEngineCore", str(dp_rank))
+                decorate_logs()
                 # Set data parallel rank for this engine process.
                 parallel_config.data_parallel_rank = dp_rank
                 parallel_config.data_parallel_rank_local = local_dp_rank
                 engine_core = DPEngineCoreProc(*args, **kwargs)
             else:
                 set_process_title("EngineCore")
+                decorate_logs()
                 engine_core = EngineCoreProc(*args, **kwargs)
 
             engine_core.run_busy_loop()
@@ -905,8 +905,6 @@ class DPEngineCoreProc(EngineCoreProc):
         log_stats: bool,
         client_handshake_address: Optional[str] = None,
     ):
-        self._decorate_logs()
-
         # Counts forward-passes of the model so that we can synchronize
         # finished with DP peers every N steps.
         self.counter = 0
@@ -919,15 +917,6 @@ class DPEngineCoreProc(EngineCoreProc):
                          executor_class, log_stats, client_handshake_address,
                          dp_rank)
 
-    def _decorate_logs(self):
-        # Add process-specific prefix to stdout and stderr before
-        # we initialize the engine.
-        from multiprocessing import current_process
-        process_name = current_process().name
-        pid = os.getpid()
-        _add_prefix(sys.stdout, process_name, pid)
-        _add_prefix(sys.stderr, process_name, pid)
-
     def _init_data_parallel(self, vllm_config: VllmConfig):
 
         # Configure GPUs and stateless process group for data parallel.
@@ -1149,9 +1138,6 @@ class DPEngineCoreActor(DPEngineCoreProc):
                 f"{(local_dp_rank + 1) * world_size}) "
                 f"base value: \"{os.getenv(device_control_env_var)}\"") from e
 
-    def _decorate_logs(self):
-        pass
-
     @contextmanager
     def _perform_handshakes(self, handshake_address: str, identity: bytes,
                             local_client: bool, vllm_config: VllmConfig,
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 8270385053852..d90051c3224fd 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -4,7 +4,6 @@ import multiprocessing
 import os
 import pickle
 import signal
-import sys
 import threading
 import time
 import traceback
@@ -28,10 +27,11 @@ from vllm.distributed.device_communicators.shm_broadcast import (Handle,
                                                                  MessageQueue)
 from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
 from vllm.executor.multiproc_worker_utils import (
-    _add_prefix, set_multiprocessing_worker_envs)
+    set_multiprocessing_worker_envs)
 from vllm.logger import init_logger
-from vllm.utils import (get_distributed_init_method, get_loopback_ip,
-                        get_mp_context, get_open_port, set_process_title)
+from vllm.utils import (decorate_logs, get_distributed_init_method,
+                        get_loopback_ip, get_mp_context, get_open_port,
+                        set_process_title)
 from vllm.v1.executor.abstract import Executor, FailureCallback
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -382,11 +382,11 @@ class WorkerProc:
         pp_str = f"PP{rank // tp_size}" if pp_size > 1 else ""
         tp_str = f"TP{rank % tp_size}" if tp_size > 1 else ""
         suffix = f"{pp_str}{'_' if pp_str and tp_str else ''}{tp_str}"
+        process_name = "VllmWorker"
         if suffix:
             set_process_title(suffix, append=True)
-        pid = os.getpid()
-        _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid)
-        _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid)
+            process_name = f"{process_name} {suffix}"
+        decorate_logs(process_name)
 
         # Initialize MessageQueue for receiving SchedulerOutput
         self.rpc_broadcast_mq = MessageQueue.create_from_handle(

From 0f81b310db013ec9fbc1deb9de97bd9b2a9af62f Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 1 Aug 2025 02:11:40 -0700
Subject: [PATCH 18/54] [Misc] Remove upper bound in openai package version
 (#22060)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 requirements/common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/common.txt b/requirements/common.txt
index d29b3e59d35b2..6b57a3d2f1d0d 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -13,7 +13,7 @@ tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp
-openai >= 1.87.0, <= 1.90.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support)
+openai >= 1.87.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support)
 pydantic >= 2.10
 prometheus_client >= 0.18.0
 pillow  # Required for image processing

From 49314869887e169be080201ab8bcda14e745c080 Mon Sep 17 00:00:00 2001
From: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Date: Fri, 1 Aug 2025 17:11:56 +0800
Subject: [PATCH 19/54] [Doc] Added warning of speculating with draft model
 (#22047)

Signed-off-by: Dilute-l <dilu2333@163.com>
Co-authored-by: Dilute-l <dilu2333@163.com>
---
 docs/features/spec_decode.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md
index be4b91feda7aa..89d5b489e1888 100644
--- a/docs/features/spec_decode.md
+++ b/docs/features/spec_decode.md
@@ -15,6 +15,10 @@ Speculative decoding is a technique which improves inter-token latency in memory
 
 The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
 
+!!! warning
+    In vllm v0.10.0, speculative decoding with a draft model is not supported.
+    If you use the following code, you will get a `NotImplementedError`.
+
 ??? code
 
     ```python

From 28b18cc741e596ea6f9981b8365c4819523fc24b Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Fri, 1 Aug 2025 19:09:54 +0800
Subject: [PATCH 20/54] [Quantization] Enable BNB support for InternS1 (#21953)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 .../model_loader/bitsandbytes_loader.py       | 39 ++++++++++++-------
 vllm/model_executor/utils.py                  | 20 +++++++++-
 2 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index 68fcb785691c8..f54dfab5238e1 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -34,7 +34,8 @@ from vllm.model_executor.model_loader.weight_utils import (
     filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
     pt_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.models import is_pooling_model
-from vllm.model_executor.utils import (get_packed_modules_mapping,
+from vllm.model_executor.utils import (get_moe_expert_mapping,
+                                       get_packed_modules_mapping,
                                        set_weight_attrs)
 from vllm.platforms import current_platform
 
@@ -43,6 +44,12 @@ from vllm.platforms import current_platform
 logger = init_logger(__name__)
 
 
+def is_moe_model(model: torch.nn.Module) -> bool:
+    """Checks if the model contains FusedMoE layers."""
+    return bool(any(
+        isinstance(module, FusedMoE) for module in model.modules()))
+
+
 class BitsAndBytesModelLoader(BaseModelLoader):
     """Model loader to load model weights with BitAndBytes quantization."""
 
@@ -61,6 +68,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         # Store all module names (from transformers) that support
         # BNB quantization.
         self.target_modules: list[str] = []
+        # Store the mapping of expert parameters for MoE models.
+        self.expert_params_mapping: list[tuple[str, str, int, str]] = []
         # mapping weight names from transformers to vllm.
         self.weight_mapper: Callable = lambda name: name
         self.pre_quant: bool = False
@@ -413,13 +422,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 # in case model has a mixture of disk-merged and disk-split
                 # weights with same last name.
                 self.target_modules.append(name)
-            elif (isinstance(module, FusedMoE)
-                  and hasattr(module.quant_method, "quant_config")):
-                if not hasattr(model, "get_expert_mapping"):
-                    raise AttributeError(
-                        f"MoE Model {type(model).__name__} does not support "
-                        "BitsAndBytes quantization yet. Ensure this model has "
-                        "'get_expert_mapping' method.")
+            elif isinstance(module, FusedMoE) and hasattr(
+                    module.quant_method, "quant_config"):
                 # TODO: support FusedMoE with prequant and 8bit.
                 if self.pre_quant:
                     raise ValueError(
@@ -430,9 +434,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                         "BitsAndBytes 8bit quantization with FusedMoE is not "
                         "supported yet.")
                 # Get the corresponding weight name using module name and
-                # get_expert_mapping.
-                expert_mapping = model.get_expert_mapping()
-                for exp in expert_mapping:
+                # expert_params_mapping.
+
+                for exp in self.expert_params_mapping:
                     weight_name = exp[1]
                     rep_name = name.replace("experts",
                                             "") + weight_name.removesuffix(".")
@@ -464,7 +468,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             elif isinstance(module, (RowParallelLinear, )):
                 self.column_sharded_weights_modules.append(name)
             elif isinstance(module, FusedMoE):
-                expert_mapping = model.get_expert_mapping()
+                expert_mapping = self.expert_params_mapping
                 for exp in expert_mapping:
                     if exp[-1] == "w2":
                         weight_name = exp[1]
@@ -516,6 +520,13 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         self.is_pool_model = is_pooling_model(model)
         self.modules_mapping = ParamMapping(get_packed_modules_mapping(model))
 
+        if is_moe_model(model):
+            self.expert_params_mapping = get_moe_expert_mapping(model)
+            if not self.expert_params_mapping:
+                raise AttributeError(
+                    f"MoE Model {type(model).__name__} does not support "
+                    "BitsAndBytes quantization yet. Ensure this model has "
+                    "'get_expert_mapping' method.")
         # For some models like Molmo, we need to use hf_to_vllm_mapper
         # to ensure correct loading of weights.
         if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None):
@@ -569,10 +580,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         """
         from bitsandbytes.functional import QuantState
 
-        if not hasattr(model, "get_expert_mapping"):
+        if not self.expert_params_mapping:
             return dict()
 
-        expert_mapping = model.get_expert_mapping()
+        expert_mapping = self.expert_params_mapping
         expert_qs_dict = {}
         for name, module in model.named_modules():
             if not isinstance(module, FusedMoE):
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 2b20ca2a3ba3f..41ed0b09c5a2a 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utils for model executor."""
+
 import copy
 from typing import Any, Optional
 
@@ -9,6 +10,7 @@ import torch
 
 def set_random_seed(seed: int) -> None:
     from vllm.platforms import current_platform
+
     current_platform.seed_everything(seed)
 
 
@@ -29,7 +31,7 @@ def set_weight_attrs(
         return
     for key, value in weight_attrs.items():
         assert not hasattr(
-            weight, key), (f"Overwriting existing tensor attribute: {key}")
+            weight, key), f"Overwriting existing tensor attribute: {key}"
 
         # NOTE(woosuk): During weight loading, we often do something like:
         # narrowed_tensor = param.data.narrow(0, offset, len)
@@ -41,6 +43,7 @@ def set_weight_attrs(
         # we sync the param tensor after its weight loader is called.
         # TODO(woosuk): Remove this hack once we have a better solution.
         from vllm.platforms import current_platform
+
         if current_platform.is_tpu() and key == "weight_loader":
             value = _make_synced_weight_loader(value)
         setattr(weight, key, value)
@@ -77,4 +80,17 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
                 f"safely because of conflicts from {type(child).__name__}.")
         else:
             parent_map.update(child_map)
-    return parent_map
\ No newline at end of file
+    return parent_map
+
+
+def get_moe_expert_mapping(
+    model: torch.nn.Module, ) -> list[tuple[str, str, int, str]]:
+    if parent_map := getattr(model, "get_expert_mapping", None):
+        return parent_map()
+    else:
+        # We only check main components instead of whole model submodules
+        for child in model.children():
+            child_map = getattr(child, "get_expert_mapping", None)
+            if child_map is not None:
+                return child_map()
+        return []

From 87c94bc87943818ad039d5c916df793fbd081e6a Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 1 Aug 2025 13:24:46 +0100
Subject: [PATCH 21/54] Revert "Update sampling_metadata.py (#21937)" (#22088)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/sampling_metadata.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 66bcf1c4bfe50..56f0f0984bfa0 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -539,37 +539,37 @@ class SamplingTensors:
         temperatures_t = torch.tensor(
             temperatures,
             device="cpu",
-            dtype=torch.float32,
+            dtype=dtype,
             pin_memory=pin_memory,
         )
         top_ps_t = torch.tensor(
             top_ps,
             device="cpu",
-            dtype=torch.float32,
+            dtype=dtype,
             pin_memory=pin_memory,
         )
         min_ps_t = torch.tensor(
             min_ps,
             device="cpu",
-            dtype=torch.float32,
+            dtype=dtype,
             pin_memory=pin_memory,
         )
         presence_penalties_t = torch.tensor(
             presence_penalties,
             device="cpu",
-            dtype=torch.float32,
+            dtype=dtype,
             pin_memory=pin_memory,
         )
         frequency_penalties_t = torch.tensor(
             frequency_penalties,
             device="cpu",
-            dtype=torch.float32,
+            dtype=dtype,
             pin_memory=pin_memory,
         )
         repetition_penalties_t = torch.tensor(
             repetition_penalties,
             device="cpu",
-            dtype=torch.float32,
+            dtype=dtype,
             pin_memory=pin_memory,
         )
         top_ks_t = torch.tensor(

From dfbc1f88807a1bddb75fc1dd587922567d7c133f Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Fri, 1 Aug 2025 08:25:18 -0400
Subject: [PATCH 22/54] [Speculative Decoding] Add `speculators` config support
 (#21345)

---
 .../speculators/test_eagle3.py                | 16 ++++
 vllm/config.py                                | 20 +++-
 vllm/engine/arg_utils.py                      | 22 ++++-
 vllm/model_executor/models/llama_eagle3.py    | 26 +++++-
 vllm/transformers_utils/config.py             | 32 ++++++-
 vllm/transformers_utils/configs/__init__.py   |  2 +
 .../configs/speculators/__init__.py           |  2 +
 .../configs/speculators/algos.py              | 32 +++++++
 .../configs/speculators/base.py               | 91 +++++++++++++++++++
 9 files changed, 232 insertions(+), 11 deletions(-)
 create mode 100644 tests/speculative_decoding/speculators/test_eagle3.py
 create mode 100644 vllm/transformers_utils/configs/speculators/__init__.py
 create mode 100644 vllm/transformers_utils/configs/speculators/algos.py
 create mode 100644 vllm/transformers_utils/configs/speculators/base.py

diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py
new file mode 100644
index 0000000000000..c58fc8c0dc5f4
--- /dev/null
+++ b/tests/speculative_decoding/speculators/test_eagle3.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+
+@pytest.mark.parametrize(
+    "model_path",
+    [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717"),
+     ("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")])
+def test_llama(vllm_runner, example_prompts, model_path):
+    with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                  max_tokens=20)
+        print(vllm_outputs)
+        assert vllm_outputs
diff --git a/vllm/config.py b/vllm/config.py
index 93daab7d6ae97..2d61552c5dadc 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -39,8 +39,8 @@ from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
     get_sentence_transformer_tokenizer_config, is_encoder_decoder,
-    try_get_generation_config, try_get_safetensors_metadata,
-    try_get_tokenizer_config, uses_mrope)
+    maybe_override_with_speculators_target_model, try_get_generation_config,
+    try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
 # yapf conflicts with isort for this block
@@ -535,6 +535,15 @@ class ModelConfig:
                     "affect the random state of the Python process that "
                     "launched vLLM.", self.seed)
 
+        if self.runner != "draft":
+            # If we're not running the draft model, check for speculators config
+            # If speculators config, set model / tokenizer to be target model
+            self.model, self.tokenizer = maybe_override_with_speculators_target_model(  # noqa: E501
+                model=self.model,
+                tokenizer=self.tokenizer,
+                revision=self.revision,
+                trust_remote_code=self.trust_remote_code)
+
         # Keep set served_model_name before maybe_model_redirect(self.model)
         self.served_model_name = get_served_model_name(self.model,
                                                        self.served_model_name)
@@ -606,8 +615,8 @@ class ModelConfig:
                                self.config_format,
                                hf_overrides_kw=hf_overrides_kw,
                                hf_overrides_fn=hf_overrides_fn)
-        self.hf_config = hf_config
 
+        self.hf_config = hf_config
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.attention_chunk_size = getattr(self.hf_text_config,
                                             "attention_chunk_size", None)
@@ -2980,10 +2989,13 @@ class SpeculativeConfig:
                             "Chunked prefill and EAGLE are not compatible "
                             "when using V0.")
 
+                    from vllm.transformers_utils.configs import (
+                        SpeculatorsConfig)
                     from vllm.transformers_utils.configs.eagle import (
                         EAGLEConfig)
+
                     if isinstance(self.draft_model_config.hf_config,
-                                  EAGLEConfig):
+                                  (EAGLEConfig, SpeculatorsConfig)):
                         pass
                     else:
                         eagle_config = EAGLEConfig(
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 78272d983eaf5..efa077a88270a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -978,8 +978,28 @@ class EngineArgs:
         provided as a JSON string input via CLI arguments or directly as a
         dictionary from the engine.
         """
+
+        from vllm.transformers_utils.config import get_config
+        from vllm.transformers_utils.configs.speculators.base import (
+            SpeculatorsConfig)
+
         if self.speculative_config is None:
-            return None
+            hf_config = get_config(self.hf_config_path or self.model,
+                                   self.trust_remote_code, self.revision,
+                                   self.code_revision, self.config_format)
+
+            # if loading a SpeculatorsConfig, load the specualtive_config
+            # details from the config directly
+            # no user input required / expected
+            if isinstance(hf_config, SpeculatorsConfig):
+                # We create one since we dont create one
+                self.speculative_config = {}
+                self.speculative_config[
+                    "num_speculative_tokens"] = hf_config.num_lookahead_tokens
+                self.speculative_config["model"] = self.model
+                self.speculative_config["method"] = hf_config.method
+            else:
+                return None
 
         # Note(Shangming): These parameters are not obtained from the cli arg
         # '--speculative-config' and must be passed in when creating the engine
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 71275f0d58579..572930c39a846 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -51,6 +51,25 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
 
         self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
+        if getattr(config, "norm_before_residual", False):
+            self._residual_norm = self._norm_before_residual
+        else:
+            self._residual_norm = self._norm_after_residual
+
+    def _norm_before_residual(
+            self,
+            hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        hidden_states = self.hidden_norm(hidden_states)
+        residual = hidden_states
+        return hidden_states, residual
+
+    def _norm_after_residual(
+            self,
+            hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        residual = hidden_states
+        hidden_states = self.hidden_norm(hidden_states)
+        return hidden_states, residual
+
     def forward(
         self,
         positions: torch.Tensor,
@@ -59,9 +78,10 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
         residual: Optional[torch.Tensor],
     ) -> tuple[torch.Tensor, torch.Tensor]:
 
-        residual = hidden_states
         embeds = self.input_layernorm(embeds)
-        hidden_states = self.hidden_norm(hidden_states)
+
+        hidden_states, residual = self._residual_norm(
+            hidden_states=hidden_states)
 
         hidden_states = torch.cat([embeds, hidden_states], dim=-1)
         # Self Attention
@@ -102,7 +122,7 @@ class LlamaModel(nn.Module):
 
         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
-                self.config,
+                config=self.config,
                 prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"),
             )
         ])
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index fcaa48c1392a3..0e633c2c0b6ae 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -35,8 +35,9 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config,
                                              MllamaConfig, MLPSpeculatorConfig,
                                              Nemotron_Nano_VL_Config,
                                              NemotronConfig, NVLM_D_Config,
-                                             RWConfig, Step3TextConfig,
-                                             Step3VLConfig, UltravoxConfig)
+                                             RWConfig, SpeculatorsConfig,
+                                             Step3TextConfig, Step3VLConfig,
+                                             UltravoxConfig)
 # yapf: enable
 from vllm.transformers_utils.configs.mistral import adapt_config_dict
 from vllm.transformers_utils.utils import check_gguf_file
@@ -81,6 +82,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
     "mlp_speculator": MLPSpeculatorConfig,
     "medusa": MedusaConfig,
     "eagle": EAGLEConfig,
+    "speculators": SpeculatorsConfig,
     "nemotron": NemotronConfig,
     "NVLM_D": NVLM_D_Config,
     "ultravox": UltravoxConfig,
@@ -287,6 +289,27 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig:
     return config
 
 
+def maybe_override_with_speculators_target_model(
+        model: str,
+        tokenizer: str,
+        trust_remote_code: bool,
+        revision: Optional[str] = None) -> tuple[str, str]:
+    """
+    If running a speculators config, override running model with target model
+    """
+    config_dict, _ = PretrainedConfig.get_config_dict(
+        model,
+        revision=revision,
+        trust_remote_code=trust_remote_code,
+        token=_get_hf_token(),
+    )
+    spec_config = config_dict.get("speculators_config")
+    # Return the target model
+    if spec_config is not None:
+        model = tokenizer = spec_config["verifier"]["name_or_path"]
+    return model, tokenizer
+
+
 def get_config(
     model: Union[str, Path],
     trust_remote_code: bool,
@@ -345,9 +368,12 @@ def get_config(
             token=_get_hf_token(),
             **kwargs,
         )
-
         # Use custom model class if it's in our registry
         model_type = config_dict.get("model_type")
+        if model_type is None:
+            model_type = "speculators" if config_dict.get(
+                "speculators_config") is not None else model_type
+
         if model_type in _CONFIG_REGISTRY:
             config_class = _CONFIG_REGISTRY[model_type]
             config = config_class.from_pretrained(
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 96733da726181..64ace167a5a00 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -24,6 +24,7 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
 from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
 from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
+from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
 from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
                                                       Step3VisionEncoderConfig,
                                                       Step3VLConfig)
@@ -44,6 +45,7 @@ __all__ = [
     "NemotronHConfig",
     "Nemotron_Nano_VL_Config",
     "NVLM_D_Config",
+    "SpeculatorsConfig",
     "UltravoxConfig",
     "Step3VLConfig",
     "Step3VisionEncoderConfig",
diff --git a/vllm/transformers_utils/configs/speculators/__init__.py b/vllm/transformers_utils/configs/speculators/__init__.py
new file mode 100644
index 0000000000000..208f01a7cb5ee
--- /dev/null
+++ b/vllm/transformers_utils/configs/speculators/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/vllm/transformers_utils/configs/speculators/algos.py b/vllm/transformers_utils/configs/speculators/algos.py
new file mode 100644
index 0000000000000..efc87b6bcf26f
--- /dev/null
+++ b/vllm/transformers_utils/configs/speculators/algos.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+SUPPORTED_SPECULATORS_TYPES = {}
+
+
+def register_speculator(name):
+
+    def decorator(fn):
+        SUPPORTED_SPECULATORS_TYPES[name] = fn
+        return fn
+
+    return decorator
+
+
+@register_speculator("eagle3")
+def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
+    """
+    Apply Eagle-3 specific configuration transformations.
+    
+    Eagle-3 specific fields:
+    - draft_vocab_size: Size of the draft model's vocabulary
+    - target_hidden_size: Hidden size of the target model
+    - norm_before_residual: Whether to apply norm before residual connection
+    """
+
+    vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size")
+    if config_dict.get("target_hidden_size") is not None:
+        vllm_config["target_hidden_size"] = config_dict["target_hidden_size"]
+    vllm_config["norm_before_residual"] = config_dict.get(
+        "norm_before_residual", True)
+    vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"]
diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py
new file mode 100644
index 0000000000000..d7c16e180c709
--- /dev/null
+++ b/vllm/transformers_utils/configs/speculators/base.py
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from typing import Any, Union
+
+from transformers import PretrainedConfig
+
+from vllm.transformers_utils.configs.speculators.algos import (
+    SUPPORTED_SPECULATORS_TYPES)
+
+__all__ = ["SpeculatorsConfig"]
+
+
+class SpeculatorsConfig(PretrainedConfig):
+    model_type = "speculators"
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        **kwargs,
+    ) -> "SpeculatorsConfig":
+        """Load speculators Eagle config and convert to vLLM format."""
+        config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
+                                             **kwargs)
+
+        speculators_model_type = config_dict.get("speculators_model_type")
+        if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
+            raise ValueError(
+                f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
+                "Please ensure you're loading a speculators-format model.")
+
+        # validate fields
+        # TODO: @dsikka - use speculators pydantic model to validate
+        cls.validate_speculators_config(config_dict=config_dict)
+        # Convert from speculators config -> format that can be ingested by vLLM
+        vllm_config = cls.convert_speculators_to_vllm(config_dict=config_dict)
+        # Apply anything specific to the supported algorithm
+        algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
+        algo_updater(config_dict=config_dict, vllm_config=vllm_config)
+        return cls(**vllm_config)
+
+    @classmethod
+    def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None:
+        try:
+            spec_config = config_dict["speculators_config"]
+            methods = spec_config["proposal_methods"]
+            first_method = methods[0]
+            _ = first_method["speculative_tokens"]
+            _ = spec_config["verifier"]["name_or_path"]
+            _ = config_dict["speculators_model_type"]
+        except (KeyError, IndexError, TypeError) as e:
+            raise ValueError("Invalid speculators config structure") from e
+
+        if "transformer_layer_config" not in config_dict:
+            raise ValueError("Must provide transformer_layer_config")
+
+        if not isinstance(config_dict["transformer_layer_config"], dict):
+            raise TypeError(
+                "'transformer_layer_config' must be a dictionary if provided")
+
+    @classmethod
+    def convert_speculators_to_vllm(
+            cls, config_dict: dict[str, Any]) -> dict[str, Any]:
+        """
+        Convert speculators config format to vLLM format.
+        
+        This method handles the translation of field names and structure
+        between speculators and vLLM formats.
+        
+        Returns:
+            Dictionary with vLLM-compatible configuration
+        """
+        # Currently we only support one proposal method
+        spec_config = config_dict["speculators_config"]
+        first_method = spec_config.get("proposal_methods")[0]
+        num_lookahead_tokens = first_method.get("speculative_tokens")
+
+        if num_lookahead_tokens is None:
+            raise ValueError(
+                "Missing 'speculative_tokens' in proposal method. "
+                f"Got: {first_method}")
+
+        # Build base vLLM config
+        vllm_config = {
+            "method": config_dict.get("speculators_model_type"),
+            "num_lookahead_tokens": num_lookahead_tokens,
+            "target_model": spec_config.get("verifier")["name_or_path"]
+        }
+        vllm_config.update(config_dict["transformer_layer_config"])
+        return vllm_config

From 26b5f7bd2a4005dccb797804c93cbce329253003 Mon Sep 17 00:00:00 2001
From: TJian <tunjian.tan@embeddedllm.com>
Date: Fri, 1 Aug 2025 05:25:20 -0700
Subject: [PATCH 23/54] [BUG] [ROCm] Fix import bug on ROCm (#22083)

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 vllm/compilation/pass_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 11e03daced160..54f00d5415216 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -7,7 +7,7 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
-if current_platform.is_cuda_alike():
+if current_platform.is_cuda():
     from .fusion import FusionPass
     from .collective_fusion import AllReduceFusionPass, AsyncTPPass
     from .fusion_attn import AttnFusionPass

From fb0e0d46fc443f08bc2a859b839f0f66c6a7f670 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 1 Aug 2025 13:26:42 +0100
Subject: [PATCH 24/54] Fix `get_kwargs` for case where type hint is
 `list[Union[str, type]]` (#22016)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/engine/test_arg_utils.py |  7 ++++++-
 vllm/engine/arg_utils.py       | 10 ++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 1d1926068d28c..c282bf002304a 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -5,7 +5,7 @@ import json
 from argparse import ArgumentError
 from contextlib import nullcontext
 from dataclasses import dataclass, field
-from typing import Annotated, Literal, Optional
+from typing import Annotated, Literal, Optional, Union
 
 import pytest
 
@@ -136,6 +136,8 @@ class DummyConfig:
     """List with variable length"""
     list_literal: list[Literal[1, 2]] = field(default_factory=list)
     """List with literal choices"""
+    list_union: list[Union[str, type[object]]] = field(default_factory=list)
+    """List with union type"""
     literal_literal: Literal[Literal[1], Literal[2]] = 1
     """Literal of literals with default 1"""
     json_tip: dict = field(default_factory=dict)
@@ -187,6 +189,9 @@ def test_get_kwargs():
     assert kwargs["list_literal"]["type"] is int
     assert kwargs["list_literal"]["nargs"] == "+"
     assert kwargs["list_literal"]["choices"] == [1, 2]
+    # lists with unions should become str type.
+    # If not, we cannot know which type to use for parsing
+    assert kwargs["list_union"]["type"] is str
     # literals of literals should have merged choices
     assert kwargs["literal_literal"]["choices"] == [1, 2]
     # dict should have json tip in help
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index efa077a88270a..f938f19b90469 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -217,10 +217,12 @@ Additionally, list elements can be passed individually using `+`:
         elif contains_type(type_hints, list):
             type_hint = get_type(type_hints, list)
             types = get_args(type_hint)
-            assert len(types) == 1, (
-                "List type must have exactly one type. Got "
-                f"{type_hint} with types {types}")
-            kwargs[name]["type"] = types[0]
+            list_type = types[0]
+            if get_origin(list_type) is Union:
+                msg = "List type must contain str if it is a Union."
+                assert str in get_args(list_type), msg
+                list_type = str
+            kwargs[name]["type"] = list_type
             kwargs[name]["nargs"] = "+"
         elif contains_type(type_hints, int):
             kwargs[name]["type"] = int

From f81c1bb05504672ddd66905161c6ada549fd4b85 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Fri, 1 Aug 2025 08:28:45 -0400
Subject: [PATCH 25/54] [Bugfix] Check NVIDIA artifactory is accessible before
 using flashinfer cubin kernels (#21893)

---
 vllm/attention/backends/flashinfer.py    | 46 +-------------
 vllm/utils/flashinfer.py                 | 81 +++++++++++++++++++++++-
 vllm/v1/attention/backends/flashinfer.py | 49 +-------------
 vllm/v1/attention/backends/mla/common.py | 16 ++---
 4 files changed, 93 insertions(+), 99 deletions(-)

diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 824ff8cca201a..b3372ce2eca8c 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -44,9 +44,9 @@ from vllm.attention.layer import Attention
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
                         make_tensor_with_pad)
+from vllm.utils.flashinfer import use_trtllm_decode_attention
 
 logger = init_logger(__name__)
 
@@ -56,7 +56,6 @@ if TYPE_CHECKING:
 
 
 class FlashInferBackend(AttentionBackend):
-    cached_sm100a_supported: Optional[bool] = None
 
     @staticmethod
     def get_name() -> str:
@@ -123,47 +122,6 @@ class FlashInferBackend(AttentionBackend):
         else:
             raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
 
-    @staticmethod
-    def use_trtllm_decode_attention(
-        batch_size: int,
-        max_seq_len: int,
-        kv_cache_dtype: str,
-        num_qo_heads: Optional[int],
-        num_kv_heads: Optional[int],
-        attn_head_size: Optional[int],
-    ) -> bool:
-        if FlashInferBackend.cached_sm100a_supported is None:
-            FlashInferBackend.cached_sm100a_supported = (
-                current_platform.has_device_capability(100))
-        if not FlashInferBackend.cached_sm100a_supported:
-            return False
-        # Check if the dimensions are supported by TRTLLM decode attention
-        if (attn_head_size is None or num_qo_heads is None
-                or num_kv_heads is None or num_qo_heads // num_kv_heads > 8
-                or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128):
-            return False
-        env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION
-        if env_value is not None:
-            logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s",
-                             env_value)
-            # Environment variable is set - respect it
-            # Making the conditional check for zero because
-            # the path is automatically enabled if the batch size condition
-            # is satisfied.
-            no_use_trtllm = (env_value == "0")
-            if not no_use_trtllm:
-                logger.info_once("Using TRTLLM decode attention.")
-            return not no_use_trtllm
-        else:
-            # Environment variable not set - use auto-detection
-            use_trtllm = (FlashInferBackend.cached_sm100a_supported
-                          and batch_size <= 256 and max_seq_len < 131072
-                          and kv_cache_dtype == "auto")
-            if use_trtllm:
-                logger.warning_once(
-                    "Using TRTLLM decode attention (auto-detected).")
-        return use_trtllm
-
 
 @dataclass
 class PerLayerParameters:
@@ -1156,7 +1114,7 @@ class FlashInferImpl(AttentionImpl):
             assert decode_meta.decode_wrapper._sm_scale == softmax_scale
             # TODO: @pavanimajety Remove this once the switch happens
             # inside flashinfer.
-            if not FlashInferBackend.use_trtllm_decode_attention(
+            if not use_trtllm_decode_attention(
                     num_decode_tokens, attn_metadata.max_decode_seq_len,
                     kv_cache_dtype, attn_metadata.num_qo_heads,
                     attn_metadata.num_kv_heads, attn_metadata.head_dim):
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 3bfb9808c0a00..29967bc516715 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -10,12 +10,25 @@ import contextlib
 import functools
 import importlib
 import importlib.util
-from typing import Any, Callable, NoReturn
+import os
+from typing import Any, Callable, NoReturn, Optional
 
+import requests
+
+import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
+# This is the storage path for the cubins, it can be replaced
+# with a local path for testing.
+# Referenced from https://github.com/flashinfer-ai/flashinfer/blob/0c9a92c3d9a7e043ab6f3f7b2273269caf6ab044/flashinfer/jit/cubin_loader.py#L35  # noqa: E501
+FLASHINFER_CUBINS_REPOSITORY = os.environ.get(
+    "FLASHINFER_CUBINS_REPOSITORY",
+    "https://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/",  # noqa: E501
+)
+
 
 @functools.cache
 def has_flashinfer() -> bool:
@@ -108,6 +121,70 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
     return True
 
 
+@functools.cache
+def has_nvidia_artifactory() -> bool:
+    """Return ``True`` if NVIDIA's artifactory is accessible.
+    
+    This checks connectivity to the kernel inference library artifactory
+    which is required for downloading certain cubin kernels like TRTLLM FHMA.
+    """
+    try:
+        # Use a short timeout to avoid blocking for too long
+        response = requests.get(FLASHINFER_CUBINS_REPOSITORY, timeout=5)
+        accessible = response.status_code == 200
+        if accessible:
+            logger.debug_once("NVIDIA artifactory is accessible")
+        else:
+            logger.warning_once(
+                "NVIDIA artifactory returned failed status code: %d",
+                response.status_code)
+        return accessible
+    except Exception as e:
+        logger.warning_once("Failed to connect to NVIDIA artifactory: %s", e)
+        return False
+
+
+def use_trtllm_decode_attention(
+    num_tokens: int,
+    max_seq_len: int,
+    kv_cache_dtype: str,
+    num_qo_heads: Optional[int],
+    num_kv_heads: Optional[int],
+    attn_head_size: Optional[int],
+) -> bool:
+    # Requires SM100 and NVIDIA artifactory to be accessible to download cubins
+    if not (current_platform.is_device_capability(100)
+            and has_nvidia_artifactory()):
+        return False
+
+    # Check if the dimensions are supported by TRTLLM decode attention
+    if (attn_head_size is None or num_qo_heads is None or num_kv_heads is None
+            or num_qo_heads // num_kv_heads > 8
+            or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128):
+        return False
+
+    env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION
+    if env_value is not None:
+        logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s",
+                         env_value)
+        # Environment variable is set - respect it
+        # Making the conditional check for zero because
+        # the path is automatically enabled if the batch size condition
+        # is satisfied.
+        no_use_trtllm = (env_value == "0")
+        if not no_use_trtllm:
+            logger.info_once("Using TRTLLM decode attention.")
+        return not no_use_trtllm
+    else:
+        # Environment variable not set - use auto-detection
+        use_trtllm = (num_tokens <= 256 and max_seq_len < 131072
+                      and kv_cache_dtype == "auto")
+        if use_trtllm:
+            logger.warning_once(
+                "Using TRTLLM decode attention (auto-detected).")
+        return use_trtllm
+
+
 __all__ = [
     "has_flashinfer",
     "flashinfer_trtllm_fp8_block_scale_moe",
@@ -117,4 +194,6 @@ __all__ = [
     "autotune",
     "has_flashinfer_moe",
     "has_flashinfer_cutlass_fused_moe",
+    "has_nvidia_artifactory",
+    "use_trtllm_decode_attention",
 ]
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 27552f0e7c1ef..f8af1d7e41831 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -17,8 +17,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionType)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils import cdiv
+from vllm.utils.flashinfer import use_trtllm_decode_attention
 from vllm.v1.attention.backends.flash_attn import use_cascade_attention
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout,
@@ -38,7 +38,6 @@ logger = init_logger(__name__)
 class FlashInferBackend(AttentionBackend):
 
     accept_output_buffer: bool = True
-    cached_sm100a_supported: Optional[bool] = None
 
     @classmethod
     def get_supported_dtypes(cls) -> list[torch.dtype]:
@@ -98,48 +97,6 @@ class FlashInferBackend(AttentionBackend):
             raise ValueError(f"Unknown cache layout format {cache_layout}.")
         return stride_order
 
-    @staticmethod
-    def use_trtllm_decode_attention(
-        batch_size: int,
-        max_seq_len: int,
-        kv_cache_dtype: str,
-        num_qo_heads: int,
-        num_kv_heads: int,
-        attn_head_size: int,
-    ) -> bool:
-        if FlashInferBackend.cached_sm100a_supported is None:
-            FlashInferBackend.cached_sm100a_supported = (
-                current_platform.has_device_capability(100))
-        if not FlashInferBackend.cached_sm100a_supported:
-            return False
-        if (num_qo_heads // num_kv_heads > 8
-                or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128):
-            return False
-        env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION
-        if env_value is not None:
-            logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s",
-                             env_value)
-            # Environment variable is set - respect it
-            # Making the conditional check for zero because
-            # the path is automatically enabled if the batch size condition
-            # is satisfied.
-            no_use_trtllm = env_value == "0"
-            if not no_use_trtllm:
-                logger.info_once(
-                    "VLLM_USE_TRTLLM_DECODE_ATTENTION is set to 1, "
-                    "using TRTLLM decode attention.")
-            return not no_use_trtllm
-        else:
-            # Environment variable not set - use auto-detection
-            # Only supports attention head size of 128
-            use_trtllm = (FlashInferBackend.cached_sm100a_supported
-                          and batch_size <= 256 and max_seq_len < 131072
-                          and kv_cache_dtype == "auto")
-            if use_trtllm:
-                logger.warning_once(
-                    "Using TRTLLM decode attention (auto-detected).")
-        return use_trtllm
-
     @staticmethod
     def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype:
         if kv_cache_dtype in ("fp8", "fp8_e4m3"):
@@ -352,7 +309,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
 
             if num_decodes > 0:
                 attn_metadata.decode_wrapper = self._get_decode_wrapper()
-                if not FlashInferBackend.use_trtllm_decode_attention(
+                if not use_trtllm_decode_attention(
                         num_decodes, attn_metadata.max_seq_len,
                         self.cache_config.cache_dtype,
                         attn_metadata.num_qo_heads, attn_metadata.num_kv_heads,
@@ -636,7 +593,7 @@ class FlashInferImpl(AttentionImpl):
             decode_query = query[:num_decode_tokens]
             assert decode_query.shape[0] == num_decode_tokens
             assert decode_wrapper is not None
-            if not FlashInferBackend.use_trtllm_decode_attention(
+            if not use_trtllm_decode_attention(
                     attn_metadata.num_decodes, attn_metadata.max_seq_len,
                     self.kv_cache_dtype, attn_metadata.num_qo_heads,
                     attn_metadata.num_kv_heads, attn_metadata.head_dim):
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 0095d75217856..d112468f1c91d 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -209,6 +209,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                UnquantizedLinearMethod)
 from vllm.platforms import current_platform
 from vllm.utils import cdiv, round_down
+from vllm.utils.flashinfer import has_nvidia_artifactory
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder, CommonAttentionMetadata,
     get_per_layer_parameters, infer_global_hyperparameters,
@@ -379,17 +380,16 @@ M = TypeVar("M", bound=MLACommonMetadata)
 
 
 def use_flashinfer_prefill() -> bool:
-    if flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL:
-        # For blackwell default to flashinfer prefill if its available since
-        #  its faster than FA2.
-        return current_platform.has_device_capability(100)
-    return False
+    # For blackwell default to flashinfer prefill if its available since
+    # it is faster than FA2.
+    return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL
+            and current_platform.is_device_capability(100))
 
 
 def use_cudnn_prefill() -> bool:
-    if flashinfer_available and envs.VLLM_USE_CUDNN_PREFILL:
-        return current_platform.has_device_capability(100)
-    return False
+    return (flashinfer_available and envs.VLLM_USE_CUDNN_PREFILL
+            and current_platform.is_device_capability(100)
+            and has_nvidia_artifactory())
 
 
 # Currently 394MB, this can be tuned based on GEMM sizes used.

From 0a6d305e0f7b63b06c87bb1f7564ae8d148a3311 Mon Sep 17 00:00:00 2001
From: Gamhang <leeebucks@gmail.com>
Date: Fri, 1 Aug 2025 21:07:33 +0800
Subject: [PATCH 26/54] feat(multimodal): Add customizable background color for
 RGBA to RGB conversion (#22052)

Signed-off-by: Jinheng Li <ahengljh@gmail.com>
Co-authored-by: Jinheng Li <ahengljh@gmail.com>
---
 docs/features/multimodal_inputs.md |  44 +++++++++++
 tests/multimodal/test_image.py     | 115 ++++++++++++++++++++++++++++-
 vllm/multimodal/image.py           |  37 ++++++++--
 3 files changed, 190 insertions(+), 6 deletions(-)

diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index b8677f11a1d3c..cdd32924b5668 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -172,6 +172,36 @@ Multi-image input can be extended to perform video captioning. We show this with
         print(generated_text)
     ```
 
+#### Custom RGBA Background Color
+
+When loading RGBA images (images with transparency), vLLM converts them to RGB format. By default, transparent pixels are replaced with white background. You can customize this background color using the `rgba_background_color` parameter in `media_io_kwargs`.
+
+??? code
+
+    ```python
+    from vllm import LLM
+    
+    # Default white background (no configuration needed)
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    
+    # Custom black background for dark theme
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
+    )
+    
+    # Custom brand color background (e.g., blue)
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf", 
+        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
+    )
+    ```
+
+!!! note
+    - The `rgba_background_color` accepts RGB values as a list `[R, G, B]` or tuple `(R, G, B)` where each value is 0-255
+    - This setting only affects RGBA images with transparency; RGB images are unchanged
+    - If not specified, the default white background `(255, 255, 255)` is used for backward compatibility
+
 ### Video Inputs
 
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
@@ -478,6 +508,20 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
     export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
     ```
 
+#### Custom RGBA Background Color
+
+To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`:
+
+```bash
+# Example: Black background for dark theme
+vllm serve llava-hf/llava-1.5-7b-hf \
+  --media-io-kwargs '{"image": {"rgba_background_color": [0, 0, 0]}}'
+
+# Example: Custom gray background
+vllm serve llava-hf/llava-1.5-7b-hf \
+  --media-io-kwargs '{"image": {"rgba_background_color": [128, 128, 128]}}'
+```
+
 ### Audio Inputs
 
 Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py
index cfd44351a6d1f..271a85f1195ec 100644
--- a/tests/multimodal/test_image.py
+++ b/tests/multimodal/test_image.py
@@ -3,9 +3,10 @@
 from pathlib import Path
 
 import numpy as np
+import pytest
 from PIL import Image, ImageChops
 
-from vllm.multimodal.image import convert_image_mode
+from vllm.multimodal.image import ImageMediaIO, convert_image_mode
 
 ASSETS_DIR = Path(__file__).parent / "assets"
 assert ASSETS_DIR.exists()
@@ -35,3 +36,115 @@ def test_rgba_to_rgb():
                 assert converted_image_numpy[i][j][0] == 255
                 assert converted_image_numpy[i][j][1] == 255
                 assert converted_image_numpy[i][j][2] == 255
+
+
+def test_rgba_to_rgb_custom_background(tmp_path):
+    """Test RGBA to RGB conversion with custom background colors."""
+    # Create a simple RGBA image with transparent and opaque pixels
+    rgba_image = Image.new("RGBA", (10, 10),
+                           (255, 0, 0, 255))  # Red with full opacity
+
+    # Make top-left quadrant transparent
+    for i in range(5):
+        for j in range(5):
+            rgba_image.putpixel((i, j), (0, 0, 0, 0))  # Fully transparent
+
+    # Save the test image to tmp_path
+    test_image_path = tmp_path / "test_rgba.png"
+    rgba_image.save(test_image_path)
+
+    # Test 1: Default white background (backward compatibility)
+    image_io_default = ImageMediaIO()
+    converted_default = image_io_default.load_file(test_image_path)
+    default_numpy = np.array(converted_default)
+
+    # Check transparent pixels are white
+    assert default_numpy[0][0][0] == 255  # R
+    assert default_numpy[0][0][1] == 255  # G
+    assert default_numpy[0][0][2] == 255  # B
+    # Check opaque pixels remain red
+    assert default_numpy[5][5][0] == 255  # R
+    assert default_numpy[5][5][1] == 0  # G
+    assert default_numpy[5][5][2] == 0  # B
+
+    # Test 2: Custom black background via kwargs
+    image_io_black = ImageMediaIO(rgba_background_color=(0, 0, 0))
+    converted_black = image_io_black.load_file(test_image_path)
+    black_numpy = np.array(converted_black)
+
+    # Check transparent pixels are black
+    assert black_numpy[0][0][0] == 0  # R
+    assert black_numpy[0][0][1] == 0  # G
+    assert black_numpy[0][0][2] == 0  # B
+    # Check opaque pixels remain red
+    assert black_numpy[5][5][0] == 255  # R
+    assert black_numpy[5][5][1] == 0  # G
+    assert black_numpy[5][5][2] == 0  # B
+
+    # Test 3: Custom blue background via kwargs (as list)
+    image_io_blue = ImageMediaIO(rgba_background_color=[0, 0, 255])
+    converted_blue = image_io_blue.load_file(test_image_path)
+    blue_numpy = np.array(converted_blue)
+
+    # Check transparent pixels are blue
+    assert blue_numpy[0][0][0] == 0  # R
+    assert blue_numpy[0][0][1] == 0  # G
+    assert blue_numpy[0][0][2] == 255  # B
+
+    # Test 4: Test with load_bytes method
+    with open(test_image_path, 'rb') as f:
+        image_data = f.read()
+
+    image_io_green = ImageMediaIO(rgba_background_color=(0, 255, 0))
+    converted_green = image_io_green.load_bytes(image_data)
+    green_numpy = np.array(converted_green)
+
+    # Check transparent pixels are green
+    assert green_numpy[0][0][0] == 0  # R
+    assert green_numpy[0][0][1] == 255  # G
+    assert green_numpy[0][0][2] == 0  # B
+
+
+def test_rgba_background_color_validation():
+    """Test that invalid rgba_background_color values are properly rejected."""
+
+    # Test invalid types
+    with pytest.raises(ValueError,
+                       match="rgba_background_color must be a list or tuple"):
+        ImageMediaIO(rgba_background_color="255,255,255")
+
+    with pytest.raises(ValueError,
+                       match="rgba_background_color must be a list or tuple"):
+        ImageMediaIO(rgba_background_color=255)
+
+    # Test wrong number of elements
+    with pytest.raises(ValueError,
+                       match="rgba_background_color must be a list or tuple"):
+        ImageMediaIO(rgba_background_color=(255, 255))
+
+    with pytest.raises(ValueError,
+                       match="rgba_background_color must be a list or tuple"):
+        ImageMediaIO(rgba_background_color=(255, 255, 255, 255))
+
+    # Test non-integer values
+    with pytest.raises(ValueError,
+                       match="rgba_background_color must be a list or tuple"):
+        ImageMediaIO(rgba_background_color=(255.0, 255.0, 255.0))
+
+    with pytest.raises(ValueError,
+                       match="rgba_background_color must be a list or tuple"):
+        ImageMediaIO(rgba_background_color=(255, "255", 255))
+
+    # Test out of range values
+    with pytest.raises(ValueError,
+                       match="rgba_background_color must be a list or tuple"):
+        ImageMediaIO(rgba_background_color=(256, 255, 255))
+
+    with pytest.raises(ValueError,
+                       match="rgba_background_color must be a list or tuple"):
+        ImageMediaIO(rgba_background_color=(255, -1, 255))
+
+    # Test that valid values work
+    ImageMediaIO(rgba_background_color=(0, 0, 0))  # Should not raise
+    ImageMediaIO(rgba_background_color=[255, 255, 255])  # Should not raise
+    ImageMediaIO(rgba_background_color=(128, 128, 128))  # Should not raise
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index a0448a80ac7c2..1006c1ce4b241 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -3,6 +3,7 @@
 
 from io import BytesIO
 from pathlib import Path
+from typing import Union
 
 import pybase64
 import torch
@@ -23,9 +24,10 @@ def rescale_image_size(image: Image.Image,
     return image
 
 
-# TODO: Support customizable background color to fill in.
 def rgba_to_rgb(
-    image: Image.Image, background_color=(255, 255, 255)) -> Image.Image:
+    image: Image.Image,
+    background_color: Union[tuple[int, int, int], list[int]] = (255, 255, 255)
+) -> Image.Image:
     """Convert an RGBA image to RGB with filled background color."""
     assert image.mode == "RGBA"
     converted = Image.new("RGB", image.size, background_color)
@@ -55,10 +57,35 @@ class ImageMediaIO(MediaIO[Image.Image]):
         # for flexible control.
         self.kwargs = kwargs
 
+        # Extract RGBA background color from kwargs if provided
+        # Default to white background for backward compatibility
+        rgba_bg = kwargs.get('rgba_background_color', (255, 255, 255))
+        # Convert list to tuple for consistency
+        if isinstance(rgba_bg, list):
+            rgba_bg = tuple(rgba_bg)
+
+        # Validate rgba_background_color format
+        if not (isinstance(rgba_bg, tuple) and len(rgba_bg) == 3
+                and all(isinstance(c, int) and 0 <= c <= 255
+                        for c in rgba_bg)):
+            raise ValueError(
+                "rgba_background_color must be a list or tuple of 3 integers "
+                "in the range [0, 255].")
+        self.rgba_background_color = rgba_bg
+
+    def _convert_image_mode(self, image: Image.Image) -> Image.Image:
+        """Convert image mode with custom background color."""
+        if image.mode == self.image_mode:
+            return image
+        elif image.mode == "RGBA" and self.image_mode == "RGB":
+            return rgba_to_rgb(image, self.rgba_background_color)
+        else:
+            return convert_image_mode(image, self.image_mode)
+
     def load_bytes(self, data: bytes) -> Image.Image:
         image = Image.open(BytesIO(data))
         image.load()
-        return convert_image_mode(image, self.image_mode)
+        return self._convert_image_mode(image)
 
     def load_base64(self, media_type: str, data: str) -> Image.Image:
         return self.load_bytes(pybase64.b64decode(data, validate=True))
@@ -66,7 +93,7 @@ class ImageMediaIO(MediaIO[Image.Image]):
     def load_file(self, filepath: Path) -> Image.Image:
         image = Image.open(filepath)
         image.load()
-        return convert_image_mode(image, self.image_mode)
+        return self._convert_image_mode(image)
 
     def encode_base64(
         self,
@@ -77,7 +104,7 @@ class ImageMediaIO(MediaIO[Image.Image]):
         image = media
 
         with BytesIO() as buffer:
-            image = convert_image_mode(image, self.image_mode)
+            image = self._convert_image_mode(image)
             image.save(buffer, image_format)
             data = buffer.getvalue()
 

From 5c54d9759d3e12d66919826bf1b7c196914d3a92 Mon Sep 17 00:00:00 2001
From: Abirdcfly <fp544037857@gmail.com>
Date: Fri, 1 Aug 2025 21:08:45 +0800
Subject: [PATCH 27/54] [Bugfix][PD] set max_completion_tokens=1 if req has
 this value (#21841)

Signed-off-by: Abirdcfly <fp544037857@gmail.com>
---
 .../online_serving/disaggregated_serving/disagg_proxy_demo.py   | 2 ++
 .../disagg_proxy_p2p_nccl_xpyd.py                               | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
index 16c32dcaa5d31..d39edb0b9d15c 100644
--- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
+++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
@@ -293,6 +293,8 @@ class Proxy:
             # add params to request
             kv_prepare_request = request.copy()
             kv_prepare_request["max_tokens"] = 1
+            if "max_completion_tokens" in kv_prepare_request:
+                kv_prepare_request["max_completion_tokens"] = 1
 
             # prefill stage
             prefill_instance = self.schedule(self.prefill_cycler)
diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
index a6fd92feb2f11..73da7af85f1d9 100644
--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
@@ -128,6 +128,8 @@ async def handle_request():
         prefill_request = original_request_data.copy()
         # change max_tokens = 1 to let it only do prefill
         prefill_request["max_tokens"] = 1
+        if "max_completion_tokens" in prefill_request:
+            prefill_request["max_completion_tokens"] = 1
 
         global count
         global prefill_instances

From a59cd9d9f7fd89e19beeffb7e7f89437d413eafb Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Fri, 1 Aug 2025 09:10:30 -0400
Subject: [PATCH 28/54] [Refactor] Fix Compile Warning #1444-D (#21462)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 csrc/moe/topk_softmax_kernels.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index 0b505d2e04a21..7a7865b901de1 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -24,9 +24,12 @@
 #ifndef USE_ROCM
     #include <cub/util_type.cuh>
     #include <cub/cub.cuh>
+    #include <cuda/std/functional>
+    using AddOp = cuda::std::plus<float>;
 #else
     #include <hipcub/util_type.hpp>
     #include <hipcub/hipcub.hpp>
+    using AddOp = cub::Sum; 
 #endif
 
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -62,7 +65,6 @@ __launch_bounds__(TPB) __global__
 
     const int thread_row_offset = blockIdx.x * num_cols;
 
-    cub::Sum sum;
     float threadData(-FLT_MAX);
 
     // Don't touch finished rows.
@@ -92,7 +94,7 @@ __launch_bounds__(TPB) __global__
         threadData += exp((static_cast<float>(input[idx]) - float_max));
     }
 
-    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
+    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp());
 
     if (threadIdx.x == 0)
     {

From 8026a335a135af2e53c7d89652863312d7a3c936 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@users.noreply.github.com>
Date: Fri, 1 Aug 2025 10:11:29 -0400
Subject: [PATCH 29/54] [BugFix] Update AttnFusionPass cache key (#21947)

Signed-off-by: Richard Zou <zou3519@gmail.com>
---
 vllm/compilation/fusion_attn.py   | 3 +++
 vllm/compilation/inductor_pass.py | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py
index 79518b6f4f965..a40a8caf34a88 100644
--- a/vllm/compilation/fusion_attn.py
+++ b/vllm/compilation/fusion_attn.py
@@ -164,3 +164,6 @@ class AttnFusionPass(VllmInductorPass):
         logger.debug("Fused quantization onto %s attention nodes", count)
         self.dump_graph(graph, "after_attn_fusion")
         self.end_and_log()
+
+    def uuid(self):
+        return VllmInductorPass.hash_source(self, AttentionStaticQuantPattern)
diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py
index 810d0801e9f38..2a149c65b3877 100644
--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
@@ -76,9 +76,10 @@ class InductorPass(CustomGraphPass):
         for src in srcs:
             if isinstance(src, str):
                 src_str = src
-            elif isinstance(src, types.FunctionType):
+            elif isinstance(src, (types.FunctionType, type)):
                 src_str = inspect.getsource(src)
             else:
+                # object instance
                 src_str = inspect.getsource(src.__class__)
             hasher.update(src_str.encode("utf-8"))
         return hasher.hexdigest()

From 3146519add735bc51a6a983af9e9c4a8b8d3373e Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 1 Aug 2025 15:37:55 +0100
Subject: [PATCH 30/54] [BugFix] Don't change title of top-level process
 (#22032)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 vllm/entrypoints/cli/serve.py         | 11 ++++++-----
 vllm/entrypoints/openai/api_server.py |  4 ++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 0305354a66e85..9762a1de9edd3 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -18,7 +18,8 @@ from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG,
                                     show_filtered_argument_or_group_from_help)
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, decorate_logs, get_tcp_uri
+from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri,
+                        set_process_title)
 from vllm.v1.engine.core import EngineCoreProc
 from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
 from vllm.v1.executor.abstract import Executor
@@ -74,7 +75,7 @@ def run_headless(args: argparse.Namespace):
 
     if args.api_server_count > 1:
         raise ValueError("api_server_count can't be set in headless mode")
-    # set_process_title("Headless_ProcManager")
+
     # Create the EngineConfig.
     engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
     usage_context = UsageContext.OPENAI_API_SERVER
@@ -139,8 +140,6 @@ def run_multi_api_server(args: argparse.Namespace):
 
     orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache
 
-    # set_process_title("ProcManager")
-
     if num_api_servers > 1:
         setup_multiprocess_prometheus()
 
@@ -225,7 +224,9 @@ def run_api_server_worker_proc(listen_address,
                                **uvicorn_kwargs) -> None:
     """Entrypoint for individual API server worker processes."""
 
-    # Add process-specific prefix to stdout and stderr.
+    # Set process title and add process-specific prefix to stdout and stderr.
+    server_index = client_config.get("client_index", 0) if client_config else 0
+    set_process_title("APIServer", str(server_index))
     decorate_logs()
 
     uvloop.run(
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 26db1357da4d0..1be03c57a1f1b 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -102,7 +102,7 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs,
                         get_open_zmq_ipc_path, is_valid_ipv6_address,
-                        set_process_title, set_ulimit)
+                        set_ulimit)
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
 
@@ -1824,7 +1824,7 @@ async def run_server_worker(listen_address,
         ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 
     server_index = client_config.get("client_index", 0) if client_config else 0
-    set_process_title("APIServer", str(server_index))
+
     # Load logging config for uvicorn if specified
     log_config = load_log_config(args.log_config_file)
     if log_config is not None:

From 97608dc276c292d9217eb6d334d969c5e89913c6 Mon Sep 17 00:00:00 2001
From: David Xia <david@davidxia.com>
Date: Fri, 1 Aug 2025 10:55:55 -0400
Subject: [PATCH 31/54] [Docs] use `uv` in CPU installation docs (#22089)

Signed-off-by: David Xia <david@davidxia.com>
---
 .../installation/cpu/apple.inc.md             | 12 ++---
 .../installation/cpu/build.inc.md             | 22 +++++----
 .../installation/cpu/s390x.inc.md             | 45 ++++++++++++-------
 3 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/docs/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md
index 0816f38ac68a1..2828173a76a9a 100644
--- a/docs/getting_started/installation/cpu/apple.inc.md
+++ b/docs/getting_started/installation/cpu/apple.inc.md
@@ -1,6 +1,6 @@
 # --8<-- [start:installation]
 
-vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS.
+vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS.
 
 Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
@@ -23,20 +23,20 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
-After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
+After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from source.
 
 ```bash
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
-pip install -r requirements/cpu.txt
-pip install -e .
+uv pip install -r requirements/cpu.txt
+uv pip install -e .
 ```
 
 !!! note
-    On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
+    On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device.
 
 !!! example "Troubleshooting"
-    If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
+    If the build fails with errors like the following where standard C++ headers cannot be found, try to remove and reinstall your
     [Command Line Tools for Xcode](https://developer.apple.com/download/all/).
 
     ```text
diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md
index fa777fe0c8a1a..57a09e674a821 100644
--- a/docs/getting_started/installation/cpu/build.inc.md
+++ b/docs/getting_started/installation/cpu/build.inc.md
@@ -1,4 +1,4 @@
-First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
+First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
 
 ```bash
 sudo apt-get update  -y
@@ -6,28 +6,34 @@ sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certific
 sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 ```
 
-Second, clone vLLM project:
+Second, clone the vLLM project:
 
 ```bash
 git clone https://github.com/vllm-project/vllm.git vllm_source
 cd vllm_source
 ```
 
-Third, install Python packages for vLLM CPU backend building:
+Third, install required dependencies:
 
 ```bash
-pip install --upgrade pip
-pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+uv pip install -r requirements/cpu-build.txt --torch-backend auto
+uv pip install -r requirements/cpu.txt --torch-backend auto
 ```
 
-Finally, build and install vLLM CPU backend:
+??? console "pip"
+    ```bash
+    pip install --upgrade pip
+    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    ```
+
+Finally, build and install vLLM:
 
 ```bash
 VLLM_TARGET_DEVICE=cpu python setup.py install
 ```
 
-If you want to develop vllm, install it in editable mode instead.
+If you want to develop vLLM, install it in editable mode instead.
 
 ```bash
 VLLM_TARGET_DEVICE=cpu python setup.py develop
diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md
index acfb3396896bf..c1917267ce91b 100644
--- a/docs/getting_started/installation/cpu/s390x.inc.md
+++ b/docs/getting_started/installation/cpu/s390x.inc.md
@@ -1,6 +1,6 @@
 # --8<-- [start:installation]
 
-vLLM has experimental support for s390x architecture on IBM Z platform. For now, users shall build from the vLLM source to natively run on IBM Z platform.
+vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform.
 
 Currently the CPU implementation for s390x architecture supports FP32 datatype only.
 
@@ -40,21 +40,32 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y && \
     . "$HOME/.cargo/env"
 ```
 
-Execute the following commands to build and install vLLM from the source.
+Execute the following commands to build and install vLLM from source.
 
 !!! tip
-    Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM.
+    Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM.
 
 ```bash
     sed -i '/^torch/d' requirements-build.txt    # remove torch from requirements-build.txt since we use nightly builds
-    pip install -v \
-        --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
+    uv pip install -v \
+        --torch-backend auto \
         -r requirements-build.txt \
         -r requirements-cpu.txt \
     VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
-    pip install dist/*.whl
+        uv pip install dist/*.whl
 ```
 
+??? console "pip"
+    ```bash
+        sed -i '/^torch/d' requirements-build.txt    # remove torch from requirements-build.txt since we use nightly builds
+        pip install -v \
+            --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
+            -r requirements-build.txt \
+            -r requirements-cpu.txt \
+        VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
+            pip install dist/*.whl
+    ```
+
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
 
@@ -63,19 +74,19 @@ Execute the following commands to build and install vLLM from the source.
 
 ```bash
 docker build -f docker/Dockerfile.s390x \
-        --tag vllm-cpu-env .
+    --tag vllm-cpu-env .
 
-# Launching OpenAI server
+# Launch OpenAI server
 docker run --rm \
-            --privileged=true \
-            --shm-size=4g \
-            -p 8000:8000 \
-            -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
-            -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
-            vllm-cpu-env \
-            --model=meta-llama/Llama-3.2-1B-Instruct \
-            --dtype=float \
-            other vLLM OpenAI server arguments
+    --privileged true \
+    --shm-size 4g \
+    -p 8000:8000 \
+    -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
+    -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
+    vllm-cpu-env \
+    --model meta-llama/Llama-3.2-1B-Instruct \
+    --dtype float \
+    other vLLM OpenAI server arguments
 ```
 
 # --8<-- [end:build-image-from-source]

From 2d7b09b998980b9ccbb3708632b47bc28de076aa Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 1 Aug 2025 17:16:37 +0100
Subject: [PATCH 32/54] Deprecate `--disable-log-requests` and replace with
 `--enable-log-requests` (#21739)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .buildkite/nightly-benchmarks/README.md       |  1 -
 .../tests/genai-perf-tests.json               |  1 -
 .../tests/nightly-tests.json                  |  6 ----
 .../tests/serving-tests-cpu-snc2.json         |  6 ----
 .../tests/serving-tests-cpu-snc3.json         |  6 ----
 .../tests/serving-tests-cpu.json              |  5 ----
 .../tests/serving-tests.json                  |  6 +---
 tests/config/test_mp_reducer.py               |  1 -
 tests/mq_llm_engine/test_load.py              |  2 +-
 tests/v1/engine/test_async_llm.py             |  4 +--
 tests/v1/test_async_llm_dp.py                 |  1 -
 vllm/engine/arg_utils.py                      | 30 ++++++++++++++++---
 vllm/engine/async_llm_engine.py               | 26 +++++++++-------
 vllm/engine/multiprocessing/engine.py         | 27 ++++++++++++-----
 vllm/entrypoints/openai/api_server.py         | 12 ++++----
 vllm/entrypoints/openai/run_batch.py          |  6 ++--
 vllm/utils/__init__.py                        |  6 ++++
 vllm/v1/engine/async_llm.py                   | 30 +++++++++++--------
 18 files changed, 97 insertions(+), 79 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index fcde284efea98..3721d3d1d6749 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -104,7 +104,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "load_format": "dummy"
         },
         "client_parameters": {
diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
index edbe9f2df0ce0..f26ae7634f3d9 100644
--- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@@ -11,7 +11,6 @@
         },
         "vllm_server_parameters": {
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
             "num_scheduler_steps": 10,
             "max_num_seqs": 512,
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index fda1a7a3ec53c..41b4a4008801d 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -35,7 +35,6 @@
         }, 
         "vllm_server_parameters": {
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
             "num_scheduler_steps": 10,
             "max_num_seqs": 512,
@@ -90,7 +89,6 @@
         }, 
         "vllm_server_parameters": {
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
             "num_scheduler_steps": 10,
             "max_num_seqs": 512,
@@ -145,7 +143,6 @@
         }, 
         "vllm_server_parameters": {
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
             "num_scheduler_steps": 10,
             "max_num_seqs": 512,
@@ -197,7 +194,6 @@
         }, 
         "vllm_server_parameters": {
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
             "num_scheduler_steps": 10,
             "max_num_seqs": 512,
@@ -251,7 +247,6 @@
         }, 
         "vllm_server_parameters": {
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
             "num_scheduler_steps": 10,
             "max_num_seqs": 512,
@@ -305,7 +300,6 @@
         }, 
         "vllm_server_parameters": {
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
             "num_scheduler_steps": 10,
             "max_num_seqs": 512,
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
index a144b4420fbf1..dd0e24edff98d 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -17,7 +17,6 @@
 	    "block_size": 128,
 	    "trust_remote_code": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -50,7 +49,6 @@
 	    "block_size": 128,
 	    "trust_remote_code": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -83,7 +81,6 @@
 	    "block_size": 128,
 	    "trust_remote_code": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -117,7 +114,6 @@
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -153,7 +149,6 @@
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -189,7 +184,6 @@
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
index e6e69b63b74df..f1bda65a7590b 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -17,7 +17,6 @@
 	    "block_size": 128,
 	    "trust_remote_code": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -50,7 +49,6 @@
 	    "block_size": 128,
 	    "trust_remote_code": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -84,7 +82,6 @@
 	    "block_size": 128,
 	    "trust_remote_code": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -118,7 +115,6 @@
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -154,7 +150,6 @@
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -191,7 +186,6 @@
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
index ce1f924de387f..f150b9abeea45 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
@@ -17,7 +17,6 @@
 	    "block_size": 128,
 	    "trust_remote_code": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -50,7 +49,6 @@
 	    "block_size": 128,
 	    "trust_remote_code": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -83,7 +81,6 @@
 	    "block_size": 128,
 	    "trust_remote_code": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -117,7 +114,6 @@
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
@@ -153,7 +149,6 @@
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
             "disable_log_stats": "",
-            "disable_log_requests": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json
index 13fd5aa8db97b..a6d4141d5c2dc 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -7,7 +7,6 @@
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "load_format": "dummy"
         },
         "client_parameters": {
@@ -26,7 +25,6 @@
             "tensor_parallel_size": 4,
             "swap_space": 16,
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "load_format": "dummy"
         },
         "client_parameters": {
@@ -45,7 +43,6 @@
             "tensor_parallel_size": 2,
             "swap_space": 16,
             "disable_log_stats": "",
-            "disable_log_requests": "",
             "load_format": "dummy"
         },
         "client_parameters": {
@@ -60,8 +57,7 @@
         "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
         "qps_list": [2],
         "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "disable_log_requests": "", 
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
             "tensor_parallel_size": 4,
             "swap_space": 16,
             "speculative_config": {
diff --git a/tests/config/test_mp_reducer.py b/tests/config/test_mp_reducer.py
index ee351cbfa7c16..d4d4be293280b 100644
--- a/tests/config/test_mp_reducer.py
+++ b/tests/config/test_mp_reducer.py
@@ -28,7 +28,6 @@ def test_mp_reducer(monkeypatch):
             max_model_len=32,
             gpu_memory_utilization=0.1,
             disable_log_stats=True,
-            disable_log_requests=True,
         )
 
         async_llm = AsyncLLM.from_engine_args(
diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py
index e9fd5b814f285..c934706611ae3 100644
--- a/tests/mq_llm_engine/test_load.py
+++ b/tests/mq_llm_engine/test_load.py
@@ -16,7 +16,7 @@ NUM_EXPECTED_TOKENS = 10
 NUM_REQUESTS = 10000
 
 # Scenarios to test for num generated token.
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True)
+ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
 
 
 @pytest.fixture(scope="function")
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 412df3acff126..21694491dd73a 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -26,12 +26,10 @@ if not current_platform.is_cuda():
 TEXT_ENGINE_ARGS = AsyncEngineArgs(
     model="meta-llama/Llama-3.2-1B-Instruct",
     enforce_eager=True,
-    disable_log_requests=True,
 )
 
 VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct",
-                                     enforce_eager=True,
-                                     disable_log_requests=True)
+                                     enforce_eager=True)
 
 TEXT_PROMPT = "Hello my name is Robert and"
 
diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py
index 6716d27f571f9..c2610a87ac780 100644
--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/test_async_llm_dp.py
@@ -25,7 +25,6 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2))
 engine_args = AsyncEngineArgs(
     model="ibm-research/PowerMoE-3b",
     enforce_eager=True,
-    disable_log_requests=True,
     tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
     data_parallel_size=DP_SIZE,
 )
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index f938f19b90469..0d38b5b5302c1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -18,7 +18,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
 import regex as re
 import torch
 from pydantic import TypeAdapter, ValidationError
-from typing_extensions import TypeIs
+from typing_extensions import TypeIs, deprecated
 
 import vllm.envs as envs
 from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
@@ -1704,7 +1704,23 @@ class EngineArgs:
 @dataclass
 class AsyncEngineArgs(EngineArgs):
     """Arguments for asynchronous vLLM engine."""
-    disable_log_requests: bool = False
+    enable_log_requests: bool = False
+
+    @property
+    @deprecated(
+        "`disable_log_requests` is deprecated and has been replaced with "
+        "`enable_log_requests`. This will be removed in v0.12.0. Please use "
+        "`enable_log_requests` instead.")
+    def disable_log_requests(self) -> bool:
+        return not self.enable_log_requests
+
+    @disable_log_requests.setter
+    @deprecated(
+        "`disable_log_requests` is deprecated and has been replaced with "
+        "`enable_log_requests`. This will be removed in v0.12.0. Please use "
+        "`enable_log_requests` instead.")
+    def disable_log_requests(self, value: bool):
+        self.enable_log_requests = not value
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser,
@@ -1715,9 +1731,15 @@ class AsyncEngineArgs(EngineArgs):
         load_general_plugins()
         if not async_args_only:
             parser = EngineArgs.add_cli_args(parser)
+        parser.add_argument('--enable-log-requests',
+                            action=argparse.BooleanOptionalAction,
+                            default=AsyncEngineArgs.enable_log_requests,
+                            help='Enable logging requests.')
         parser.add_argument('--disable-log-requests',
-                            action='store_true',
-                            help='Disable logging requests.')
+                            action=argparse.BooleanOptionalAction,
+                            default=not AsyncEngineArgs.enable_log_requests,
+                            help='[DEPRECATED] Disable logging requests.',
+                            deprecated=True)
         current_platform.pre_register_and_update(parser)
         return parser
 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 06bb4eeab69eb..1f962b008ee03 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -30,7 +30,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, weak_bind
+from vllm.utils import Device, deprecate_kwargs, weak_bind
 
 logger = init_logger(__name__)
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -554,14 +554,20 @@ class AsyncLLMEngine(EngineClient):
         return LLMEngine._get_executor_cls(engine_config)
 
     @classmethod
+    @deprecate_kwargs(
+        "disable_log_requests",
+        additional_message=("This argument will have no effect. "
+                            "Use `enable_log_requests` instead."),
+    )
     def from_vllm_config(
-        cls,
-        vllm_config: VllmConfig,
-        start_engine_loop: bool = True,
-        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
-        disable_log_requests: bool = False,
-        disable_log_stats: bool = False,
+            cls,
+            vllm_config: VllmConfig,
+            start_engine_loop: bool = True,
+            usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+            stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
+            enable_log_requests: bool = False,
+            disable_log_stats: bool = False,
+            disable_log_requests: bool = True,  # Deprecated, will be removed
     ) -> "AsyncLLMEngine":
         """Create an AsyncLLMEngine from the EngineArgs."""
 
@@ -569,7 +575,7 @@ class AsyncLLMEngine(EngineClient):
             vllm_config=vllm_config,
             executor_class=cls._get_executor_cls(vllm_config),
             start_engine_loop=start_engine_loop,
-            log_requests=not disable_log_requests,
+            log_requests=enable_log_requests,
             log_stats=not disable_log_stats,
             usage_context=usage_context,
             stat_loggers=stat_loggers,
@@ -598,7 +604,7 @@ class AsyncLLMEngine(EngineClient):
             usage_context=usage_context,
             stat_loggers=stat_loggers,
             disable_log_stats=engine_args.disable_log_stats,
-            disable_log_requests=engine_args.disable_log_requests,
+            enable_log_requests=engine_args.enable_log_requests,
         )
 
     @property
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index fe6eb0d8c2f1a..903f3fd71ebcd 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -34,6 +34,7 @@ from vllm.outputs import RequestOutput
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.usage.usage_lib import UsageContext
+from vllm.utils import deprecate_kwargs
 from vllm.worker.model_runner_base import InputProcessingError
 
 logger = init_logger(__name__)
@@ -120,10 +121,20 @@ class MQLLMEngine:
             return ENGINE_DEAD_ERROR()
 
     @classmethod
-    def from_vllm_config(cls, vllm_config: VllmConfig,
-                         usage_context: UsageContext,
-                         disable_log_requests: bool, disable_log_stats: bool,
-                         ipc_path: str) -> "MQLLMEngine":
+    @deprecate_kwargs(
+        "disable_log_requests",
+        additional_message=("This argument will have no effect. "
+                            "Use `enable_log_requests` instead."),
+    )
+    def from_vllm_config(
+            cls,
+            vllm_config: VllmConfig,
+            usage_context: UsageContext,
+            enable_log_requests: bool,
+            disable_log_stats: bool,
+            ipc_path: str,
+            disable_log_requests: bool = True,  # Deprecated, will be removed
+    ) -> "MQLLMEngine":
         # Setup plugins for each process
         from vllm.plugins import load_general_plugins
         load_general_plugins()
@@ -136,7 +147,7 @@ class MQLLMEngine:
             ipc_path=ipc_path,
             usage_context=usage_context,
             use_async_sockets=use_async_sockets,
-            log_requests=(not disable_log_requests),
+            log_requests=enable_log_requests,
             log_stats=(not disable_log_stats),
         )
 
@@ -150,7 +161,7 @@ class MQLLMEngine:
             ipc_path=ipc_path,
             vllm_config=vllm_config,
             usage_context=usage_context,
-            disable_log_requests=engine_args.disable_log_requests,
+            enable_log_requests=engine_args.enable_log_requests,
             disable_log_stats=engine_args.disable_log_stats,
         )
 
@@ -436,7 +447,7 @@ def signal_handler(*_) -> None:
 
 def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
                   ipc_path: str, disable_log_stats: bool,
-                  disable_log_requests: bool, engine_alive):
+                  enable_log_requests: bool, engine_alive):
     try:
         # Ensure we can serialize transformer config before spawning
         maybe_register_config_serialize_by_value()
@@ -445,7 +456,7 @@ def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
             vllm_config=vllm_config,
             usage_context=usage_context,
             disable_log_stats=disable_log_stats,
-            disable_log_requests=disable_log_requests,
+            enable_log_requests=enable_log_requests,
             ipc_path=ipc_path)
 
         signal.signal(signal.SIGTERM, signal_handler)
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 1be03c57a1f1b..b8ec5461f7719 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -205,7 +205,7 @@ async def build_async_engine_client_from_engine_args(
             async_llm = AsyncLLM.from_vllm_config(
                 vllm_config=vllm_config,
                 usage_context=usage_context,
-                disable_log_requests=engine_args.disable_log_requests,
+                enable_log_requests=engine_args.enable_log_requests,
                 disable_log_stats=engine_args.disable_log_stats,
                 client_addresses=client_config,
                 client_index=client_index)
@@ -227,7 +227,7 @@ async def build_async_engine_client_from_engine_args(
             engine_client = AsyncLLMEngine.from_vllm_config(
                 vllm_config=vllm_config,
                 usage_context=usage_context,
-                disable_log_requests=engine_args.disable_log_requests,
+                enable_log_requests=engine_args.enable_log_requests,
                 disable_log_stats=engine_args.disable_log_stats)
             yield engine_client
         finally:
@@ -272,7 +272,7 @@ async def build_async_engine_client_from_engine_args(
             target=run_mp_engine,
             args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path,
                   engine_args.disable_log_stats,
-                  engine_args.disable_log_requests, engine_alive))
+                  engine_args.enable_log_requests, engine_alive))
         engine_process.start()
         engine_pid = engine_process.pid
         assert engine_pid is not None, "Engine process failed to start."
@@ -1570,10 +1570,10 @@ async def init_app_state(
     else:
         served_model_names = [args.model]
 
-    if args.disable_log_requests:
-        request_logger = None
-    else:
+    if args.enable_log_requests:
         request_logger = RequestLogger(max_log_len=args.max_log_len)
+    else:
+        request_logger = None
 
     base_model_paths = [
         BaseModelPath(name=name, model_path=args.model)
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 137b368dad202..d146ad485d194 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -324,10 +324,10 @@ async def run_batch(
     else:
         served_model_names = [args.model]
 
-    if args.disable_log_requests:
-        request_logger = None
-    else:
+    if args.enable_log_requests:
         request_logger = RequestLogger(max_log_len=args.max_log_len)
+    else:
+        request_logger = None
 
     base_model_paths = [
         BaseModelPath(name=name, model_path=args.model)
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index d5d8d9dad73a8..7405f3986df8d 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1668,6 +1668,12 @@ class FlexibleArgumentParser(ArgumentParser):
         # Enable the deprecated kwarg for Python 3.12 and below
 
         def parse_known_args(self, args=None, namespace=None):
+            if args is not None and "--disable-log-requests" in args:
+                # Special case warning because the warning below won't trigger
+                # if –-disable-log-requests because its value is default.
+                logger.warning_once(
+                    "argument '--disable-log-requests' is deprecated. This "
+                    "will be removed in v0.12.0.")
             namespace, args = super().parse_known_args(args, namespace)
             for action in FlexibleArgumentParser._deprecated:
                 if (hasattr(namespace, dest := action.dest)
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index ed0d9620f4762..308ca32105ba9 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -27,7 +27,7 @@ from vllm.transformers_utils.config import (
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, cdiv
+from vllm.utils import Device, cdiv, deprecate_kwargs
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
@@ -142,16 +142,22 @@ class AsyncLLM(EngineClient):
             pass
 
     @classmethod
+    @deprecate_kwargs(
+        "disable_log_requests",
+        additional_message=("This argument will have no effect. "
+                            "Use `enable_log_requests` instead."),
+    )
     def from_vllm_config(
-        cls,
-        vllm_config: VllmConfig,
-        start_engine_loop: bool = True,
-        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
-        disable_log_requests: bool = False,
-        disable_log_stats: bool = False,
-        client_addresses: Optional[dict[str, str]] = None,
-        client_index: int = 0,
+            cls,
+            vllm_config: VllmConfig,
+            start_engine_loop: bool = True,
+            usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+            stat_loggers: Optional[list[StatLoggerFactory]] = None,
+            enable_log_requests: bool = False,
+            disable_log_stats: bool = False,
+            client_addresses: Optional[dict[str, str]] = None,
+            client_index: int = 0,
+            disable_log_requests: bool = True,  # Deprecated, will be removed
     ) -> "AsyncLLM":
         if not envs.VLLM_USE_V1:
             raise ValueError(
@@ -166,7 +172,7 @@ class AsyncLLM(EngineClient):
             executor_class=Executor.get_class(vllm_config),
             start_engine_loop=start_engine_loop,
             stat_loggers=stat_loggers,
-            log_requests=not disable_log_requests,
+            log_requests=enable_log_requests,
             log_stats=not disable_log_stats,
             usage_context=usage_context,
             client_addresses=client_addresses,
@@ -191,7 +197,7 @@ class AsyncLLM(EngineClient):
         return cls(
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_requests=not engine_args.disable_log_requests,
+            log_requests=engine_args.enable_log_requests,
             log_stats=not engine_args.disable_log_stats,
             start_engine_loop=start_engine_loop,
             usage_context=usage_context,

From 326a1b001db10afc2dc5b2bfcb60a3b8f8bcb2ac Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 1 Aug 2025 17:32:27 +0100
Subject: [PATCH 33/54] Improve documentation of
 `ModelConfig.try_get_generation_config` to prevent future confusion (#21526)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 2d61552c5dadc..124d62b699771 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1575,7 +1575,18 @@ class ModelConfig:
         return self.multimodal_config
 
     def try_get_generation_config(self) -> dict[str, Any]:
-        if self.generation_config in ("auto", "vllm"):
+        """
+        This method attempts to retrieve the non-default values of the
+        generation config for this model.
+        
+        The generation config can contain information about special tokens, as
+        well as sampling parameters. Which is why this method exists separately
+        to `get_diff_sampling_param`.
+
+        Returns:
+            A dictionary containing the non-default generation config.
+        """
+        if self.generation_config in {"auto", "vllm"}:
             config = try_get_generation_config(
                 self.hf_config_path or self.model,
                 trust_remote_code=self.trust_remote_code,
@@ -1594,13 +1605,18 @@ class ModelConfig:
 
     def get_diff_sampling_param(self) -> dict[str, Any]:
         """
-        This method returns a dictionary containing the parameters
-        that differ from the default sampling parameters. If
-        `generation_config` is `"vllm"`, an empty dictionary is returned.
+        This method returns a dictionary containing the non-default sampling
+        parameters with `override_generation_config` applied.
+
+        The default sampling parameters are:
+
+        - vLLM's neutral defaults if `self.generation_config="vllm"`
+        - the model's defaults if `self.generation_config="auto"`
+        - as defined in `generation_config.json` if
+            `self.generation_config="path/to/generation_config/dir"`
 
         Returns:
-            dict[str, Any]: A dictionary with the differing sampling
-            parameters, if `generation_config` is `"vllm"` an empty dictionary.
+            A dictionary containing the non-default sampling parameters.
         """
         if self.generation_config == "vllm":
             config = {}

From 3f8e9521791dd3f41c90cc2b3c9e78a1951f5237 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 2 Aug 2025 00:33:30 +0800
Subject: [PATCH 34/54] [Bugfix] Fix glm4.1v video inference issue (#22067)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 .../multimodal/processing/test_glm4_1v.py     | 51 +++++++++++++++++++
 vllm/model_executor/models/glm4_1v.py         |  8 +--
 2 files changed, 53 insertions(+), 6 deletions(-)
 create mode 100644 tests/models/multimodal/processing/test_glm4_1v.py

diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py
new file mode 100644
index 0000000000000..d1c5fa8fec6d2
--- /dev/null
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.assets.video import VideoAsset
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"])
+@pytest.mark.parametrize("expected_toks_per_frame", [299])
+@pytest.mark.parametrize("num_frames", [32, 128])
+@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)])
+def test_processor_override(
+    model_id: str,
+    expected_toks_per_frame: int,
+    expected_grid_t: int,
+    fps: int,
+    num_frames: int,
+):
+    """Ensure GLM4vMultiModalProcessor can handle video frames properly."""
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"video": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    tokenizer = processor.info.get_tokenizer()
+    hf_processor_mm_kwargs = {"fps": fps}
+
+    # Build the image str / prompt based on the number of images we pass
+    video_assets = VideoAsset(name="baby_reading", num_frames=num_frames)
+    prompt = "<|begin_of_video|><|video|><|end_of_video|>"
+
+    video, metadata = video_assets.np_ndarrays, video_assets.metadata
+    metadata["fps"] = fps
+    mm_data = {"video": [(video, metadata)]}
+
+    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+
+    # Ensure we have the right number of placeholders per num_crops size
+    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
+    video_tok_count = processed_inputs["prompt_token_ids"].count(
+        video_token_id)
+    grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0]
+
+    assert grid_t == expected_grid_t
+    assert video_tok_count == expected_toks_per_frame * grid_t
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 5f306f05d140e..7c9840790fe3e 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -937,7 +937,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
                               total_frames: int) -> list[int]:
         video_processor = self.get_video_processor()
 
-        video_fps = metadata.get("fps", 2.0)
+        video_fps = metadata.get("fps", video_processor.fps)
         meta_frames = metadata.get("total_num_frames", total_frames)
         max_frame_idx = meta_frames - 1
         duration = metadata.get("duration",
@@ -1120,11 +1120,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
                     video_placeholder,
                 )
 
-                grid_t = len(video_outputs["video_grid_thw"])
-                _, grid_h, grid_w = video_outputs["video_grid_thw"][0]
-                grid_thw = torch.tensor([[grid_t, grid_h, grid_w]])
-
-                video_grid_thw_lst.append(grid_thw)
+                video_grid_thw_lst.append(video_outputs["video_grid_thw"])
                 pixel_values_videos_lst.append(
                     video_outputs["pixel_values_videos"])
             video_outputs = dict(

From b879ecd6e2636b6af893052615693a51466381ec Mon Sep 17 00:00:00 2001
From: "rongfu.leng" <rongfu.leng@daocloud.io>
Date: Sat, 2 Aug 2025 01:09:36 +0800
Subject: [PATCH 35/54] [Bugfix] fix when skip tokenizer init (#21922)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
---
 tests/v1/engine/test_llm_engine.py | 26 ++++++++++++++++++++++++++
 vllm/v1/engine/processor.py        |  9 +++++++--
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py
index f37686317fd14..2848420c22085 100644
--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@@ -213,3 +213,29 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
         assert len(num_accepted_tokens_per_pos) == 1
         assert isinstance(num_accepted_tokens_per_pos[0], Vector)
         assert len(num_accepted_tokens_per_pos[0].values) == 5
+
+
+@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
+def test_skip_tokenizer_initialization(model: str,
+                                       monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_V1", "1")
+    # This test checks if the flag skip_tokenizer_init skips the initialization
+    # of tokenizer and detokenizer. The generated output is expected to contain
+    # token ids.
+    llm = LLM(
+        model=model,
+        skip_tokenizer_init=True,
+        enforce_eager=True,
+    )
+    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
+
+    with pytest.raises(ValueError, match="cannot pass text prompts when"):
+        llm.generate("abc", sampling_params)
+
+    outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
+                           sampling_params=sampling_params)
+    assert len(outputs) > 0
+    completions = outputs[0].outputs
+    assert len(completions) > 0
+    assert completions[0].text == ""
+    assert completions[0].token_ids
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 0f2f404a130ef..224acc47feb27 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -89,6 +89,10 @@ class Processor:
             return
         if not params.allowed_token_ids:
             raise ValueError("allowed_token_ids is not None and empty!")
+        if self.tokenizer is None:
+            # When skip_tokenizer_init=True, we can't validate token IDs
+            # Skip validation and let the model handle invalid tokens
+            return
         tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
         vocab_size = len(tokenizer)
         if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
@@ -283,8 +287,9 @@ class Processor:
                     len(decoder_inputs["prompt_token_ids"]))
             sampling_params.update_from_generation_config(
                 self.generation_config_fields, eos_token_id)
-            sampling_params.update_from_tokenizer(
-                self.tokenizer.get_lora_tokenizer(lora_request))
+            if self.tokenizer is not None:
+                sampling_params.update_from_tokenizer(
+                    self.tokenizer.get_lora_tokenizer(lora_request))
         else:
             pooling_params = params.clone()
 

From d6664664b442cb236f8541a126e4076a5e12c56d Mon Sep 17 00:00:00 2001
From: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Date: Fri, 1 Aug 2025 21:09:49 +0400
Subject: [PATCH 36/54] security policy: take 1 (#21119)

Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
---
 SECURITY.md | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/SECURITY.md b/SECURITY.md
index 6053cfb41f35b..4f338557da1a0 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,13 +1,41 @@
 # Security Policy
 
-## Reporting a Vulnerability
+## Reporting security issues:
 
-If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
 
-Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
+## Issue triage
 
----
+Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
+
+## Threat model
 
 Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
 
 Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
+
+## Issue severity
+
+We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories:
+
+### CRITICAL Severity
+Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS  ≥ 9.0.
+
+### HIGH Severity
+Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9
+
+### MODERATE Severity
+Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9
+
+### LOW Severity
+Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0
+
+## Prenotification policy
+
+For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues.
+
+* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release.
+
+* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
+
+* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.

From ac45c44d98e77f30e47b8fb69134f4635183070d Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Fri, 1 Aug 2025 22:44:38 +0530
Subject: [PATCH 37/54] [Bugfix] [Performance] DeepEPHighThroughput + DeepSeek
 : Quant before Dispatch (#21837)

Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
---
 .../layers/fused_moe/deepep_ht_prepare_finalize.py  | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
index 7016ff34c3a85..f6b62254e7b4c 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -144,12 +144,13 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 "apply_router_weight_on_input is only implemented for topk=1")
             a1 = a1 * topk_weights.to(a1.dtype)
 
-        if quant_config.per_act_token_quant:
+        if quant_config.is_block_quantized:
+            # Quant and Dispatch
             a1q, a1q_scale = moe_kernel_quantize_input(
                 a1,
                 a1_scale,
                 quant_dtype=quant_config.quant_dtype,
-                per_act_token_quant=True,
+                per_act_token_quant=quant_config.per_act_token_quant,
                 block_shape=quant_config.block_shape,
             )
             if a1q_scale is not None and a1q_scale.numel() == 1:
@@ -162,8 +163,10 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                  rank_topk_weights=topk_weights,
                  num_experts=num_experts)
         else:
-            # DeepEP kernels only support dispatching per-token-quant
-            # quantization. dispatch in bfloat16.
+            # Dispatch and Quant
+            # DeepEP kernels only support dispatching block-quantized
+            # activation scales.
+            # Dispatch in bfloat16
             (expert_x, _, expert_tokens_meta, expert_topk_ids,
              expert_topk_weights) = self._do_dispatch(
                  tokens=a1,
@@ -171,7 +174,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                  rank_topk_ids=topk_ids,
                  rank_topk_weights=topk_weights,
                  num_experts=num_experts)
-            # quantize now
+            # Quantize after dispatch.
             expert_x_scale = None
             if expert_x.numel() != 0:
                 expert_x, expert_x_scale = moe_kernel_quantize_input(

From 38c8bce8b652df87d111c04ddf849c38615000c7 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 1 Aug 2025 18:31:29 +0100
Subject: [PATCH 38/54] Enable headless models for pooling in the Transformers
 backend (#21767)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/models/registry.py                   |  1 +
 tests/models/test_transformers.py          | 28 +++++++++++++++++-----
 vllm/config.py                             |  9 +++++--
 vllm/model_executor/models/registry.py     |  3 ++-
 vllm/model_executor/models/transformers.py | 12 ++++++++++
 5 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 806342a57dfab..fdc7888c85efb 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -525,6 +525,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
 }
 
 _TRANSFORMERS_BACKEND_MODELS = {
+    "TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"),
     "TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True),  # noqa: E501
     "TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),
 }
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 5b7d90dfb896d..66ff8f7a54d31 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -34,8 +34,7 @@ def check_implementation(
 
     with runner_test(model, **kwargs_test, **kwargs) as model_test:
         model_config = model_test.llm.llm_engine.model_config
-        assert model_config.architecture == (
-            model_config._get_transformers_backend_cls())
+        assert model_config.using_transformers_backend()
 
         outputs_test = model_test.generate_greedy_logprobs(*args)
 
@@ -135,8 +134,7 @@ def test_quantization(
             enforce_eager=True,
             **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
         model_config = vllm_model.llm.llm_engine.model_config
-        assert model_config.architecture == (
-            model_config._get_transformers_backend_cls())
+        assert model_config.using_transformers_backend()
 
         transformers_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
@@ -149,6 +147,25 @@ def test_quantization(
     )
 
 
+@pytest.mark.parametrize(
+    "model",
+    [
+        # Layers live in `layers`
+        "Qwen/Qwen3-Embedding-0.6B",
+        # Layers live in `model.layers`
+        "meta-llama/Llama-3.2-1B-Instruct"
+    ],
+)
+def test_embed_loading(vllm_runner, model):
+    with vllm_runner(model,
+                     max_model_len=1024,
+                     enforce_eager=True,
+                     runner="pooling",
+                     model_impl="transformers") as model_test:
+        model_config = model_test.llm.llm_engine.model_config
+        assert model_config.using_transformers_backend()
+
+
 @pytest.mark.parametrize(
     "model",
     ["jason9693/Qwen2.5-1.5B-apeach"],
@@ -169,8 +186,7 @@ def test_classify(
                      dtype=dtype,
                      model_impl="transformers") as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
-        assert model_config.architecture == (
-            model_config._get_transformers_backend_cls())
+        assert model_config.using_transformers_backend()
 
         vllm_outputs = vllm_model.classify(example_prompts)
 
diff --git a/vllm/config.py b/vllm/config.py
index 124d62b699771..dabb4b524dfd8 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -812,12 +812,17 @@ class ModelConfig:
     def _get_transformers_backend_cls(self) -> str:
         """Determine which Transformers backend class will be used if
         `model_impl` is set to `transformers` or `auto`."""
+        if getattr(self, "runner_type", self.runner) == "pooling":
+            return "TransformersModel"
         if self.hf_config != self.hf_text_config:
             # If 'hf_text_config' is the same as 'hf_config'. If not, it is
             # probably a composite config, i.e. multimodal
             return "TransformersForMultimodalLM"
-        else:
-            return "TransformersForCausalLM"
+        return "TransformersForCausalLM"
+
+    def using_transformers_backend(self) -> bool:
+        """Check if the model is using the Transformers backend class."""
+        return self.architecture == self._get_transformers_backend_cls()
 
     @property
     def registry(self):
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 848c04b9b32f7..0c5d87a7dc472 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -270,8 +270,9 @@ _TRANSFORMERS_SUPPORTED_MODELS = {
 }
 
 _TRANSFORMERS_BACKEND_MODELS = {
-    "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
+    "TransformersModel": ("transformers", "TransformersModel"),
     "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
+    "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
 }
 # yapf: enable
 
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index e67548800c354..5059d1e1d9fea 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -651,6 +651,18 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
+@support_torch_compile
+class TransformersModel(TransformersBase):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # Add `model.` prefix for base model checkpoints
+            "": "model.",
+            # Remove `model.` from places it should not be
+            "model.model.": "model.",
+            "model.score": "score",
+        })
+
+
 @support_torch_compile
 class TransformersForCausalLM(TransformersBase):
 

From 8d705996dffbb2299750b7b2b50bbcd5ccb4a5ad Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Sat, 2 Aug 2025 01:35:30 +0800
Subject: [PATCH 39/54] [Misc] Minor enhancement of benchmark_moe (#22068)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 benchmarks/kernels/benchmark_moe.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index c350aaf5d3ad2..72250e2fb6d2b 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -22,6 +22,13 @@ from vllm.utils import FlexibleArgumentParser
 FP8_DTYPE = current_platform.fp8_dtype()
 
 
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, (
+        "intermediate_size {} is not divisible by tp {}.".format(numerator, denominator)
+    )
+
+
 class BenchmarkConfig(TypedDict):
     BLOCK_SIZE_M: int
     BLOCK_SIZE_N: int
@@ -603,7 +610,7 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-
+    ensure_divisibility(intermediate_size, args.tp_size)
     hidden_size = config.hidden_size
     dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"

From 3277e8f9e19c396d6dd92a0901d2e3f8fb8982d4 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Fri, 1 Aug 2025 13:36:07 -0400
Subject: [PATCH 40/54] Fix pre-commit failure for SECURTIY.md (#22102)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 SECURITY.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/SECURITY.md b/SECURITY.md
index 4f338557da1a0..414669fb3712e 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,6 +1,6 @@
 # Security Policy
 
-## Reporting security issues:
+## Reporting security issues
 
 Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
 
@@ -19,15 +19,19 @@ Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/m
 We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories:
 
 ### CRITICAL Severity
+
 Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS  ≥ 9.0.
 
 ### HIGH Severity
+
 Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9
 
 ### MODERATE Severity
+
 Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9
 
 ### LOW Severity
+
 Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0
 
 ## Prenotification policy

From 9659bc7f271ec640da780b5ca739e261764b954b Mon Sep 17 00:00:00 2001
From: Animesh Jain <jainanimesh2305@yahoo.com>
Date: Fri, 1 Aug 2025 10:38:52 -0700
Subject: [PATCH 41/54] [compile][startup] Disable C++ compilation of symbolic
 shapes (#20836)

Signed-off-by: Animesh Jain <anijain@umich.edu>
---
 vllm/compilation/decorators.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 1370862d580a5..0d2c432497c40 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -267,8 +267,15 @@ def _support_torch_compile(
                     code.co_filename)
                 return inline_call(parent, func, args, kwargs)
 
-            with patch.object(InliningInstructionTranslator, 'inline_call',
-                              patched_inline_call):
+            # Disable the C++ compilation of symbolic shape guards. C++-fication
+            # of symbolic shape guards can improve guard overhead. But, since
+            # vllm skip guards anyways, setting this flag to False can improve
+            # compile time.
+            with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards",
+                                            False), patch.object(
+                                                InliningInstructionTranslator,
+                                                'inline_call',
+                                                patched_inline_call):
                 output = self.compiled_callable(*args, **kwargs)
             return output
 

From d331759488eb7627d2454549eeb01d14f83f1c41 Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Fri, 1 Aug 2025 11:50:58 -0700
Subject: [PATCH 42/54] Introduce RayPPCommunicator for ray-based PP (#21660)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
---
 .../device_communicators/ray_communicator.py  | 257 ++++++++++++++++++
 vllm/envs.py                                  |   8 +
 vllm/executor/ray_distributed_executor.py     |  15 +
 3 files changed, 280 insertions(+)
 create mode 100644 vllm/distributed/device_communicators/ray_communicator.py

diff --git a/vllm/distributed/device_communicators/ray_communicator.py b/vllm/distributed/device_communicators/ray_communicator.py
new file mode 100644
index 0000000000000..e5ba297ebcc1b
--- /dev/null
+++ b/vllm/distributed/device_communicators/ray_communicator.py
@@ -0,0 +1,257 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import uuid
+from typing import Any, Optional
+
+import ray
+import torch
+from ray.exceptions import RayChannelError
+from ray.experimental.channel.communicator import (Communicator,
+                                                   TorchTensorAllocator)
+from torch.distributed import ReduceOp
+
+from vllm.distributed.device_communicators.base_device_communicator import (
+    DeviceCommunicatorBase)
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.logger import init_logger
+from vllm.utils import current_stream
+
+logger = init_logger(__name__)
+
+
+class RayPPCommunicator(Communicator):
+    """
+    Communicator to be used for pipeline parallelism in Ray Compiled Graph.
+    This is wraps around the vLLM _PP GroupCoordinator.
+
+    This class is not thread-safe.
+    """
+
+    _comm: Optional[DeviceCommunicatorBase]
+
+    def __init__(
+        self,
+        world_size: int,
+        comm_id: Any,
+        rank: Optional[int],
+        actor_handles: list["ray.actor.ActorHandle"],
+        cuda_stream: Optional[torch.cuda.Stream],
+        use_communication_streams: bool = False,
+    ):
+        """
+        Initialize a RayPPCommunicator that can be used to communicate with
+        other Ray Compiled Graph actors for pipeline parallelism.
+
+        Args:
+            world_size: The number of participating actors.
+            comm_id: A unique communicator ID. This is just to conform with
+                the Ray Communicator API and is not used.
+            rank: The rank of this actor. If None, then the caller is not a
+                participant of the RayPPCommunicator group (e.g., the Ray
+                driver).
+            actor_handles: A list of actor handles.
+            cuda_stream: A CUDA stream to dispatch communication ops to. This
+                is not supported.
+            use_communication_streams: Whether to use communication streams.
+                This is not supported.
+        """
+        self._world_size = world_size
+        self._rank: Optional[int] = None
+        self._actor_handles = actor_handles
+        if use_communication_streams:
+            raise NotImplementedError(
+                "use_communication_streams is not supported")
+        if cuda_stream is not None and cuda_stream != current_stream():
+            raise ValueError(
+                "cuda_stream other than the current stream is not supported")
+
+        if rank is not None:
+            # Rank is not None, this is Ray worker
+            assert ray.get_gpu_ids(), "RayPPCommunicator has no GPUs assigned"
+
+            self._comm = get_pp_group().device_communicator
+
+            # Since we wrap around the vLLM _PP communicator, we use
+            # the rank from the vLLM communicator, and ignore the rank
+            # passed in from Ray.
+            # TODO(rui): refactor the Ray Communicator API so that
+            # it also supports no rank passed in.
+            self._rank = self._comm.rank_in_group
+
+            self._build_actor_rank_mapping()
+        else:
+            # Rank is None, this is Ray driver
+            self._comm = None
+
+        self._closed = False
+
+    def _build_actor_rank_mapping(self):
+        """
+        Use collective communication to build a mapping from actor IDs to ranks.
+        This should be called once during initialization.
+        """
+        if self._comm is None:
+            return {}
+
+        current_actor = ray.get_runtime_context().current_actor
+        actor_id_str = current_actor._actor_id.hex()
+
+        # Ray actor IDs are 32-character hex strings (128 bits)
+        ACTOR_ID_LEN = 32
+        actor_id_bytes = actor_id_str.encode('utf-8')
+        assert len(
+            actor_id_bytes
+        ) == ACTOR_ID_LEN, f"Unexpected actor ID length: {len(actor_id_bytes)}"
+
+        actor_id_tensor = torch.frombuffer(
+            actor_id_bytes, dtype=torch.uint8).to(self._comm.device)
+
+        # All-gather full actor IDs from all actors
+        gathered_ids = self._comm.all_gather(actor_id_tensor, dim=0)
+
+        # Build mapping: actor_id -> device_comm_rank
+        self._actor_id_to_rank = {}
+        for rank in range(self._world_size):
+            start_idx = rank * ACTOR_ID_LEN
+            end_idx = (rank + 1) * ACTOR_ID_LEN
+            actor_bytes = gathered_ids[start_idx:end_idx].cpu().numpy(
+            ).tobytes()
+            actor_id = actor_bytes.decode('utf-8')
+            self._actor_id_to_rank[actor_id] = rank
+
+    def initialize(self, rank: int) -> None:
+        # No additional initialization is needed.
+        pass
+
+    def get_actor_handles(self) -> list["ray.actor.ActorHandle"]:
+        return self._actor_handles
+
+    def get_rank(self, actor: ray.actor.ActorHandle) -> int:
+        """
+        Return the given actor's rank using device communicator collective ops.
+        """
+        assert hasattr(self, '_actor_id_to_rank'), (
+            "Actor rank mapping not built. "
+            "This should have been done during initialization.")
+
+        actor_id_str = actor._actor_id.hex()
+
+        if actor_id_str in self._actor_id_to_rank:
+            return self._actor_id_to_rank[actor_id_str]  # type: ignore
+        else:
+            raise ValueError(f"Actor {actor} not found in communicator group")
+
+    def get_self_rank(self) -> Optional[int]:
+        """
+        Return this actor's rank.
+        """
+        return self._rank
+
+    def get_world_size(self) -> int:
+        """
+        Return the number of ranks in the RayPPCommunicator group.
+        """
+        return self._world_size
+
+    def send(self, buf: "torch.Tensor", peer_rank: int) -> None:
+        """
+        Send a torch.Tensor to a peer.
+
+        This returns when the send kernel has been queued, but the kernel may
+        not have completed. Therefore, the caller should ensure that there are
+        no concurrent writes to the sent `buf` until the send has finished.
+        That is, either all writes should be submitted on the current stream
+        (self._cuda_stream) or, if on a different stream, that stream should
+        synchronize with the current stream.
+
+        Args:
+            buf: The torch.Tensor to send. It should already be on this
+                actor's default device.
+            peer_rank: The rank of the actor to send to.
+        """
+        if self._closed:
+            raise RayChannelError("RayPPCommunicator has been destroyed.")
+
+        assert self._comm is not None
+        self._comm.send(buf, peer_rank)
+
+    def recv(
+        self,
+        shape: tuple[int],
+        dtype: "torch.dtype",
+        peer_rank: int,
+        allocator: TorchTensorAllocator,
+    ) -> "torch.Tensor":
+        """
+        Receive a torch.Tensor from a peer and synchronize the current stream.
+
+        After this call returns, the receive buffer is safe to read from from
+        any stream. An RayChannelError will be raised if an error occurred
+        (e.g., remote actor died), and the buffer is not safe to read.
+
+        Args:
+            shape: The shape of the tensor to receive.
+            dtype: The dtype of the tensor to receive.
+            peer_rank: The rank of the actor to receive from.
+            allocator: The allocator to use to create the received tensor.
+                This is ignored for this implementation.
+        """
+        if self._closed:
+            raise RayChannelError("RayPPCommunicator has been destroyed.")
+
+        assert self._comm is not None
+        size = torch.Size(shape)
+        buf = self._comm.recv(size, dtype, src=peer_rank)
+
+        # Buffer values are undefined if NCCL ops are aborted. Therefore, we
+        # need to synchronize here and check that the channel is still
+        # open to ensure that the receive buffer is valid.
+        # TODO(swang): Avoid CUDA synchronization.
+        current_stream().synchronize()
+
+        if self._closed:
+            raise RayChannelError("RayPPCommunicator has been destroyed.")
+        return buf
+
+    def allgather(
+        self,
+        send_buf: "torch.Tensor",
+        recv_buf: "torch.Tensor",
+    ):
+        raise NotImplementedError("allgather is not supported")
+
+    def allreduce(
+        self,
+        send_buf: "torch.Tensor",
+        recv_buf: "torch.Tensor",
+        op: ReduceOp = ReduceOp.SUM,
+    ):
+        raise NotImplementedError("allreduce is not supported")
+
+    def reducescatter(
+        self,
+        send_buf: "torch.Tensor",
+        recv_buf: "torch.Tensor",
+        op: ReduceOp = ReduceOp.SUM,
+    ):
+        raise NotImplementedError("reducescatter is not supported")
+
+    @property
+    def recv_stream(self):
+        return torch.cuda.StreamContext(current_stream())
+
+    @property
+    def send_stream(self):
+        return torch.cuda.StreamContext(current_stream())
+
+    def destroy(self) -> None:
+        # Just sets a flag, vLLM manages the lifecycle of the underlying
+        # _PP GroupCoordinator.
+        self._closed = True
+
+    def get_transport_name(self) -> str:
+        return "nccl"
+
+    @classmethod
+    def generate_communicator_id(cls) -> Any:
+        return uuid.uuid4()
diff --git a/vllm/envs.py b/vllm/envs.py
index 7553eccf16ea9..2fda2903179b5 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -55,6 +55,7 @@ if TYPE_CHECKING:
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
     VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
+    VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
     VLLM_XLA_USE_SPMD: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
@@ -498,6 +499,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
                  ),
 
+    # If the env var is set, it uses a Ray Communicator wrapping
+    # vLLM's pipeline parallelism communicator to interact with Ray's
+    # Compiled Graph. Otherwise, it uses Ray's NCCL communicator.
+    # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
+    "VLLM_USE_RAY_WRAPPED_PP_COMM":
+    lambda: bool(int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))),
+
     # Use dedicated multiprocess context for workers.
     # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index e9ad62aeb99a8..37c3fe59c65dd 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -608,6 +608,21 @@ class RayDistributedExecutor(DistributedExecutorBase):
 
             forward_dag = MultiOutputNode(outputs)
 
+        if envs.VLLM_USE_RAY_WRAPPED_PP_COMM:
+            from ray.experimental.channel.accelerator_context import (
+                register_accelerator_context)
+
+            from vllm.distributed.device_communicators.ray_communicator import (
+                RayPPCommunicator)
+            register_accelerator_context(torch_module_name="cuda",
+                                         communicator_cls=RayPPCommunicator)
+            logger.info("Using RayPPCommunicator "
+                        "(which wraps vLLM _PP GroupCoordinator) "
+                        "for Ray Compiled Graph communication.")
+        else:
+            logger.info("Using Ray's NCCL communicator for "
+                        "Ray Compiled Graph communication.")
+
         return forward_dag.experimental_compile(
             enable_asyncio=enable_asyncio,
             _overlap_gpu_communication=envs.

From d84b97a3e33ed79aaba7552bfe5889d363875562 Mon Sep 17 00:00:00 2001
From: XiongfeiWei <isaacwxf23@gmail.com>
Date: Fri, 1 Aug 2025 11:56:08 -0700
Subject: [PATCH 43/54] Add lora test for tp>1 case for TPU. (#21970)

Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
---
 tests/tpu/lora/test_lora.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py
index b26bdd34d890e..4c47b8c43caff 100644
--- a/tests/tpu/lora/test_lora.py
+++ b/tests/tpu/lora/test_lora.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
+from torch_xla._internal import tpu
 
 import vllm
 from vllm.lora.request import LoRARequest
@@ -27,25 +28,31 @@ def use_v1_only(monkeypatch: pytest.MonkeyPatch):
         yield
 
 
-def setup_vllm(num_loras: int) -> vllm.LLM:
+def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
     return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
                     num_scheduler_steps=1,
                     max_model_len=256,
                     max_seq_len_to_capture=256,
                     max_num_seqs=8,
+                    tensor_parallel_size=tp,
                     enable_lora=True,
                     max_loras=num_loras,
                     max_lora_rank=8)
 
 
-def test_single_lora():
+TPU_TENSOR_PARALLEL_SIZES = [1, tpu.num_available_chips()
+                             ] if tpu.num_available_chips() > 1 else [1]
+
+
+@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
+def test_single_lora(tp: int):
     """
     This test ensures we can run a single LoRA adapter on the TPU backend.
     We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which
     will force Qwen2.5-3B-Instruct to claim 1+1=1.
     """
 
-    llm = setup_vllm(1)
+    llm = setup_vllm(1, tp)
 
     prompt = "What is 1+1? \n"
 
@@ -63,7 +70,8 @@ def test_single_lora():
     assert int(answer) == 1
 
 
-def test_lora_hotswapping():
+@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
+def test_lora_hotswapping(tp: int):
     """
     This test ensures we can run multiple LoRA adapters on the TPU backend, even
     if we only have space to store 1.
@@ -79,7 +87,7 @@ def test_lora_hotswapping():
         for i in range(1, 5)
     ]
 
-    llm = setup_vllm(1)
+    llm = setup_vllm(1, tp)
 
     prompt = "What is 1+1? \n"
 
@@ -94,7 +102,8 @@ def test_lora_hotswapping():
         assert int(answer) == i + 1
 
 
-def test_multi_lora():
+@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
+def test_multi_lora(tp: int):
     """
     This test ensures we can run multiple LoRA adapters on the TPU backend, when
     we have enough space to store all of them.
@@ -109,7 +118,7 @@ def test_multi_lora():
         for i in range(1, 5)
     ]
 
-    llm = setup_vllm(4)
+    llm = setup_vllm(4, tp)
 
     prompt = "What is 1+1? \n"
 

From 881e1af43a1bb7b4bedd373e413eb7ad9dc9f920 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Fri, 1 Aug 2025 22:40:45 +0100
Subject: [PATCH 44/54] [BugFix] Harden distributed DP startup (#21538)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 vllm/utils/__init__.py        |  3 ++
 vllm/v1/engine/coordinator.py | 12 +++++++
 vllm/v1/engine/core.py        | 61 +++++++++++++++++++++++------------
 3 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 7405f3986df8d..0d3fa6b059beb 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -2794,6 +2794,9 @@ def make_zmq_socket(
     if linger is not None:
         socket.setsockopt(zmq.LINGER, linger)
 
+    if socket_type == zmq.XPUB:
+        socket.setsockopt(zmq.XPUB_VERBOSE, True)
+
     # Determine if the path is a TCP socket with an IPv6 address.
     # Enable IPv6 on the zmq socket if so.
     scheme, host, _ = split_zmq_path(path)
diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py
index 440628576bcb7..8d8d1689e61e3 100644
--- a/vllm/v1/engine/coordinator.py
+++ b/vllm/v1/engine/coordinator.py
@@ -172,6 +172,18 @@ class DPCoordinatorProc:
                 bind=True,
         ) as publish_back:
 
+            # Wait until all engines subscribe.
+            for _ in self.engines:
+                if publish_back.recv() != b'\x01':
+                    logger.error(
+                        "DP Coordinator received unexpected message while "
+                        "waiting for engines to subscribe")
+                    return
+            # Send ready message to engines.
+            publish_back.send(b"READY")
+
+            logger.info("All engine subscriptions received by DP coordinator")
+
             poller = zmq.Poller()
             poller.register(publish_front, zmq.POLLIN)
             poller.register(output_back, zmq.POLLIN)
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 6ae5736df98b8..0a889b2a0a184 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -461,8 +461,11 @@ class EngineCoreProc(EngineCore):
             self.has_coordinator = addresses.coordinator_output is not None
             self.frontend_stats_publish_address = (
                 addresses.frontend_stats_publish_address)
+            logger.debug("Has DP Coordinator: %s, stats publish address: %s",
+                         self.has_coordinator,
+                         self.frontend_stats_publish_address)
             # Only publish request queue stats to coordinator for "internal"
-            # LB mode.
+            # and "hybrid" LB modes .
             self.publish_dp_lb_stats = (
                 self.has_coordinator
                 and not vllm_config.parallel_config.data_parallel_external_lb)
@@ -472,25 +475,38 @@ class EngineCoreProc(EngineCore):
             super().__init__(vllm_config, executor_class, log_stats,
                              executor_fail_callback)
 
+            # Background Threads and Queues for IO. These enable us to
+            # overlap ZMQ socket IO with GPU since they release the GIL,
+            # and to overlap some serialization/deserialization with the
+            # model forward pass.
+            # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
+            ready_event = threading.Event()
+            input_thread = threading.Thread(target=self.process_input_sockets,
+                                            args=(addresses.inputs,
+                                                  addresses.coordinator_input,
+                                                  identity, ready_event),
+                                            daemon=True)
+            input_thread.start()
+
+            self.output_thread = threading.Thread(
+                target=self.process_output_sockets,
+                args=(addresses.outputs, addresses.coordinator_output,
+                      self.engine_index),
+                daemon=True)
+            self.output_thread.start()
+
+            # Don't complete handshake until DP coordinator ready message is
+            # received.
+            while not ready_event.wait(timeout=10):
+                if not input_thread.is_alive():
+                    raise RuntimeError(
+                        "Input socket thread died during startup")
+                assert addresses.coordinator_input is not None
+                logger.info("Waiting for READY message from DP Coordinator...")
+
         self.step_fn = (self.step if self.batch_queue is None else
                         self.step_with_batch_queue)
 
-        # Background Threads and Queues for IO. These enable us to
-        # overlap ZMQ socket IO with GPU since they release the GIL,
-        # and to overlap some serialization/deserialization with the
-        # model forward pass.
-        # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
-        threading.Thread(target=self.process_input_sockets,
-                         args=(addresses.inputs, addresses.coordinator_input,
-                               identity),
-                         daemon=True).start()
-        self.output_thread = threading.Thread(
-            target=self.process_output_sockets,
-            args=(addresses.outputs, addresses.coordinator_output,
-                  self.engine_index),
-            daemon=True)
-        self.output_thread.start()
-
     @contextmanager
     def _perform_handshakes(
         self,
@@ -505,10 +521,10 @@ class EngineCoreProc(EngineCore):
 
         For DP=1 or offline mode, this is with the colocated front-end process.
 
-        For DP>1 with internal loadbalancing this is with the shared front-end
+        For DP>1 with internal load-balancing this is with the shared front-end
         process which may reside on a different node.
 
-        For DP>1 with external or hybrid loadbalancing, two handshakes are
+        For DP>1 with external or hybrid load-balancing, two handshakes are
         performed:
             - With the rank 0 front-end process which retrieves the
               DP Coordinator ZMQ addresses and DP process group address.
@@ -772,7 +788,7 @@ class EngineCoreProc(EngineCore):
 
     def process_input_sockets(self, input_addresses: list[str],
                               coord_input_address: Optional[str],
-                              identity: bytes):
+                              identity: bytes, ready_event: threading.Event):
         """Input socket IO thread."""
 
         # Msgpack serialization decoding.
@@ -809,9 +825,14 @@ class EngineCoreProc(EngineCore):
                 # back to us.
                 input_socket.send(b'')
                 poller.register(input_socket, zmq.POLLIN)
+
             if coord_socket is not None:
+                # Wait for ready message from coordinator.
+                assert coord_socket.recv() == b"READY"
                 poller.register(coord_socket, zmq.POLLIN)
 
+            ready_event.set()
+            del ready_event
             while True:
                 for input_socket, _ in poller.poll():
                     # (RequestType, RequestData)

From 88faa466d788e25082c02dc9688931d7976361f9 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Fri, 1 Aug 2025 19:18:38 -0400
Subject: [PATCH 45/54] [CI] Initial tests for SM100 Blackwell runner (#21877)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 .buildkite/test-pipeline.yaml                 | 24 ++++++++++++++++---
 tests/compile/test_fusion_all_reduce.py       | 15 +++++++-----
 .../quantization/test_cutlass_scaled_mm.py    |  5 ----
 3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 598fd5762985e..cc1223d4c4653 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -647,13 +647,31 @@ steps:
 - label: Blackwell Test
   working_dir: "/vllm-workspace/"
   gpu: b200
-  optional: true
+  # optional: true
   source_file_dependencies:
-  - csrc/
-  - vllm/
+  - csrc/quantization/fp4/
+  - csrc/attention/mla/
+  - csrc/quantization/cutlass_w8a8/moe/
+  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/fusion.py
   commands:
     - nvidia-smi
     - python3 examples/offline_inference/basic/chat.py
+    # Attention
+    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py
+    - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+    # Quantization
+    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+    # Fusion
+    - pytest -v -s tests/compile/test_fusion_all_reduce.py
 
 #####  1 GPU test  #####
 #####  multi gpus test  #####
diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py
index b394e0035c689..4c3cf6c2a10cf 100644
--- a/tests/compile/test_fusion_all_reduce.py
+++ b/tests/compile/test_fusion_all_reduce.py
@@ -136,12 +136,15 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
 
 
 @multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize("test_model", [
-    TestAllReduceRMSNormModel,
-    TestAllReduceFusedAddRMSNormModel,
-    TestAllReduceFusedAddRMSNormStaticQuantFP8Model,
-    TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
-])
+@pytest.mark.parametrize(
+    "test_model",
+    [
+        TestAllReduceRMSNormModel,
+        TestAllReduceFusedAddRMSNormModel,
+        TestAllReduceFusedAddRMSNormStaticQuantFP8Model,
+        # TODO: Enable with torch==2.8.0
+        # TestAllReduceFusedAddRMSNormStaticQuantFP4Model,
+    ])
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seq_len", [8])
 @pytest.mark.parametrize("hidden_size", [16])
diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py
index 544e6dc197904..8730eeaaa761c 100644
--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -559,8 +559,6 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
         m_a_scales = m_g if per_act_token else 1
         n_b_scales = n_g if per_out_ch else 1
 
-        print("shape:", m_g, n_g, k_g)
-
         # Create group-specific A and B (FP8) and output (FP16/FP32)
         a_g = to_fp8(torch.randn((m_g, k_g), device=device))
         b_g = to_fp8(torch.randn((n_g, k_g), device=device).t())
@@ -639,7 +637,4 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
     for g in range(num_experts):
         baseline = baseline_tensors[g]
         c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]]
-        print(baseline)
-        print(c)
-        print("*")
         torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4)

From eefbf4a68b7b0a5b8364a59647906be1b7f043e2 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Fri, 1 Aug 2025 19:18:51 -0400
Subject: [PATCH 46/54] [Perf] Optimize `reshape_and_cache_flash` CUDA Kernel
 (#22036)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 .../benchmark_reshape_and_cache_flash.py      | 156 ++++++++++++++++++
 csrc/cache_kernels.cu                         |  92 ++++++++---
 2 files changed, 225 insertions(+), 23 deletions(-)
 create mode 100644 benchmarks/kernels/benchmark_reshape_and_cache_flash.py

diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
new file mode 100644
index 0000000000000..d4648c18f31d5
--- /dev/null
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+import random
+import time
+
+import torch
+from tabulate import tabulate
+
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import (
+    STR_DTYPE_TO_TORCH_DTYPE,
+    FlexibleArgumentParser,
+    create_kv_caches_with_random_flash,
+)
+
+logger = init_logger(__name__)
+
+
+@torch.inference_mode()
+def run_benchmark(
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    kv_cache_layout: str,
+    num_iters: int,
+    device: str = "cuda",
+) -> float:
+    """Return latency (seconds) for given num_tokens."""
+
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
+
+    current_platform.seed_everything(42)
+    torch.set_default_device(device)
+
+    # create random key / value tensors [T, H, D].
+    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
+    value = torch.randn_like(key)
+
+    # prepare the slot mapping.
+    # each token is assigned a unique slot in the KV-cache.
+    num_slots = block_size * num_blocks
+    if num_tokens > num_slots:
+        raise ValueError("num_tokens cannot exceed the total number of cache slots")
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
+
+    key_caches, value_caches = create_kv_caches_with_random_flash(
+        num_blocks,
+        block_size,
+        1,  # num_layers
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        device=device,
+        cache_layout=kv_cache_layout,
+    )
+    key_cache, value_cache = key_caches[0], value_caches[0]
+
+    # compute per-kernel scaling factors for fp8 conversion (if used).
+    k_scale = (key.amax() / 64.0).to(torch.float32)
+    v_scale = (value.amax() / 64.0).to(torch.float32)
+
+    def run_cuda_benchmark(n_iters: int) -> float:
+        nonlocal key, value, key_cache, value_cache, slot_mapping
+        torch.cuda.synchronize()
+        start = time.perf_counter()
+        for _ in range(n_iters):
+            ops.reshape_and_cache_flash(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slot_mapping,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+            )
+        torch.cuda.synchronize()
+        end = time.perf_counter()
+        return (end - start) / n_iters
+
+    # warm-up
+    run_cuda_benchmark(3)
+
+    lat = run_cuda_benchmark(num_iters)
+
+    # free tensors to mitigate OOM when sweeping
+    del key, value, key_cache, value_cache, slot_mapping
+    torch.cuda.empty_cache()
+
+    return lat
+
+
+def main(args):
+    rows = []
+    for layout in ["NHD", "HND"]:
+        for exp in range(1, 17):
+            n_tok = 2**exp
+            lat = run_benchmark(
+                num_tokens=n_tok,
+                num_heads=args.num_heads,
+                head_size=args.head_size,
+                block_size=args.block_size,
+                num_blocks=args.num_blocks,
+                dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+                kv_cache_dtype=args.kv_cache_dtype,
+                kv_cache_layout=layout,
+                num_iters=args.iters,
+                device="cuda",
+            )
+            rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
+
+    print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+
+    parser.add_argument("--num-heads", type=int, default=128)
+    parser.add_argument(
+        "--head-size",
+        type=int,
+        choices=[64, 80, 96, 112, 120, 128, 192, 256],
+        default=128,
+    )
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--num-blocks", type=int, default=128 * 512)
+
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=["half", "bfloat16", "float"],
+        default="bfloat16",
+    )
+
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8"],
+        default="auto",
+    )
+
+    parser.add_argument("--iters", type=int, default=100)
+    args = parser.parse_args()
+
+    main(args)
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 88559c8fe7183..131dcb15cd7e9 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -5,6 +5,7 @@
 #include "cuda_utils.h"
 #include "cuda_compat.h"
 #include "dispatch_utils.h"
+#include "quantization/vectorization_utils.cuh"
 
 #ifdef USE_ROCM
   #include "quantization/fp8/amd/quant_utils.cuh"
@@ -261,14 +262,26 @@ __global__ void reshape_and_cache_kernel(
   }
 }
 
+// Used by vectorization_utils to copy/convert one element
+template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
+struct CopyWithScaleOp {
+  float scale;
+
+  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      dst = static_cast<OutT>(src);
+    } else {
+      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
+    }
+  }
+};
+
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_flash_kernel(
     const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
     const scalar_t* __restrict__ value,  // [num_tokens, num_heads, head_size]
-    cache_t* __restrict__ key_cache,     // [num_blocks, block_size, num_heads,
-                                         // head_size]
-    cache_t* __restrict__ value_cache,   // [num_blocks, block_size, num_heads,
-                                         // head_size]
+    cache_t* __restrict__ key_cache,     // NHD or HND, shape see comments below
+    cache_t* __restrict__ value_cache,   // same above
     const int64_t* __restrict__ slot_mapping,  // [num_tokens]
     const int64_t block_stride, const int64_t page_stride,
     const int64_t head_stride, const int64_t key_stride,
@@ -282,25 +295,58 @@ __global__ void reshape_and_cache_flash_kernel(
   }
   const int64_t block_idx = slot_idx / block_size;
   const int64_t block_offset = slot_idx % block_size;
-  const int n = num_heads * head_size;
-  for (int i = threadIdx.x; i < n; i += blockDim.x) {
-    const int64_t src_key_idx = token_idx * key_stride + i;
-    const int64_t src_value_idx = token_idx * value_stride + i;
-    const int head_idx = i / head_size;
-    const int head_offset = i % head_size;
-    const int64_t tgt_key_value_idx = block_idx * block_stride +
-                                      block_offset * page_stride +
-                                      head_idx * head_stride + head_offset;
-    scalar_t tgt_key = key[src_key_idx];
-    scalar_t tgt_value = value[src_value_idx];
-    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-      key_cache[tgt_key_value_idx] = tgt_key;
-      value_cache[tgt_key_value_idx] = tgt_value;
-    } else {
-      key_cache[tgt_key_value_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
-      value_cache[tgt_key_value_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
+  const int n_elems = num_heads * head_size;
+
+  // pointers to the beginning of the source row for this token.
+  const scalar_t* __restrict__ key_src = key + token_idx * key_stride;
+  const scalar_t* __restrict__ value_src = value + token_idx * value_stride;
+
+  // find the start position inside the kv-cache for this token.
+  cache_t* __restrict__ key_dst =
+      key_cache + block_idx * block_stride + block_offset * page_stride;
+  cache_t* __restrict__ value_dst =
+      value_cache + block_idx * block_stride + block_offset * page_stride;
+
+  // this is true for the NHD layout where `head_stride == head_size`
+  const bool is_contiguous_heads = (head_stride == head_size);
+
+  float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
+  float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
+  constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
+  CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
+  CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
+  if (is_contiguous_heads) {
+    // NHD layout
+    // kv cache: [num_blocks, block_size, num_heads, head_size]
+    vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
+                                       blockDim.x, k_op);
+
+    vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
+                                       threadIdx.x, blockDim.x, v_op);
+
+  } else {
+    // HND layout: heads are strided, but each head_size segment is contiguous
+    // kv cache: [num_blocks, num_heads, block_size, head_size]
+    const int lane = threadIdx.x & 31;     // 0..31 within warp
+    const int warp_id = threadIdx.x >> 5;  // warp index within block
+    const int warps_per_block = blockDim.x >> 5;
+
+    for (int head = warp_id; head < num_heads; head += warps_per_block) {
+      const scalar_t* __restrict__ k_src_h = key_src + head * head_size;
+      const scalar_t* __restrict__ v_src_h = value_src + head * head_size;
+
+      cache_t* __restrict__ k_dst_h =
+          key_dst + static_cast<int64_t>(head) * head_stride;
+      cache_t* __restrict__ v_dst_h =
+          value_dst + static_cast<int64_t>(head) * head_stride;
+
+      // within each head, let the 32 threads of the warp perform the vector
+      // copy
+      vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
+                                         k_op);
+
+      vectorize_with_alignment<VEC_SIZE>(v_src_h, v_dst_h, head_size, lane, 32,
+                                         v_op);
     }
   }
 }

From 3654847db5a9b9a0955f8416292d94fa1c827f77 Mon Sep 17 00:00:00 2001
From: JartX <sagformas@gmail.com>
Date: Sat, 2 Aug 2025 03:12:19 +0200
Subject: [PATCH 47/54] feat: Add Support GPTQ Quantization MOE on ROCM vllm
 serve (#21733)

---
 .../layers/fused_moe/fused_moe.py             |  4 ++--
 .../layers/quantization/gptq.py               | 22 ++++++++++++++++---
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index b69575c7e96de..56d1dfe135b3b 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -761,8 +761,8 @@ def get_moe_wna16_block_config(config: dict[str,
 
 def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int,
                               num_experts: int, bit: int):
-    return bit == 4 and group_size in [32, 64, 128] and \
-        num_valid_tokens / num_experts <= 6
+    return current_platform.is_cuda() and bit == 4 and \
+        group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6
 
 
 def get_default_config(
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index d3ab1be3bee01..f18c936bac605 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -10,10 +10,11 @@ import torch
 from torch.nn.parameter import Parameter
 
 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.linear import LinearMethodBase
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+    QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.utils.gptq_utils import (
     get_linear_quant_method)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
@@ -110,8 +111,23 @@ class GPTQConfig(QuantizationConfig):
         return cls(weight_bits, group_size, desc_act, lm_head_quantized,
                    dynamic)
 
-    def get_quant_method(self, layer: torch.nn.Module,
-                         prefix: str) -> Optional["GPTQLinearMethod"]:
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[Union["GPTQLinearMethod", "QuantizeMethodBase"]]:
+        if isinstance(layer, FusedMoE):
+            # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility
+            from .moe_wna16 import MoeWNA16Config
+
+            config = {
+                "quant_method": "gptq",
+                "bits": self.weight_bits,
+                "group_size": self.group_size,
+                "sym": True,  # GPTQ typically uses symmetric quantization
+                "lm_head": False,
+            }
+            return MoeWNA16Config.from_config(config).get_quant_method(
+                layer, prefix)
+
         return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod)
 
 

From 23322431c802bb1057426c7ca31b22e859b51644 Mon Sep 17 00:00:00 2001
From: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Date: Sat, 2 Aug 2025 09:49:34 +0800
Subject: [PATCH 48/54] [V1][CUDA] Full cudagraph support for FlashInfer
 (#21367)

---
 vllm/v1/attention/backends/flash_attn.py      |   7 +-
 vllm/v1/attention/backends/flashinfer.py      | 355 ++++++++++++++++--
 vllm/v1/attention/backends/mla/flashmla.py    |   4 +-
 .../attention/backends/mla/rocm_aiter_mla.py  |   4 +-
 vllm/v1/attention/backends/triton_attn.py     |   6 +-
 vllm/v1/attention/backends/utils.py           |  18 +-
 vllm/v1/worker/gpu_model_runner.py            |  24 +-
 vllm/v1/worker/gpu_worker.py                  |   5 +
 8 files changed, 376 insertions(+), 47 deletions(-)

diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 3f9afa67aef70..f086bab2556eb 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -25,7 +25,8 @@ if is_flash_attn_varlen_func_available():
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
 from vllm.utils import cdiv
-from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
+from vllm.v1.attention.backends.utils import (AttentionCGSupport,
+                                              AttentionMetadataBuilder,
                                               CommonAttentionMetadata,
                                               get_kv_cache_layout)
 from vllm.v1.kv_cache_interface import AttentionSpec
@@ -153,7 +154,9 @@ def _get_sliding_window_configs(
 
 class FlashAttentionMetadataBuilder(
         AttentionMetadataBuilder[FlashAttentionMetadata]):
-    full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() == 3
+    attn_cudagraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.NEVER if get_flash_attn_version() == 2 \
+        else AttentionCGSupport.ALWAYS
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index f8af1d7e41831..0aaad02b5b840 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -4,26 +4,28 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, ClassVar, Optional, Union
 
 import torch
 from flashinfer import (BatchDecodeWithPagedKVCacheWrapper,
                         BatchPrefillWithPagedKVCacheWrapper,
                         MultiLevelCascadeAttentionWrapper)
-from flashinfer.decode import trtllm_batch_decode_with_kv_cache
+from flashinfer.decode import (_get_range_buf, get_seq_lens,
+                               trtllm_batch_decode_with_kv_cache)
 
 import vllm.envs as envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionType)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import cdiv
+from vllm.utils import cdiv, is_pin_memory_available
 from vllm.utils.flashinfer import use_trtllm_decode_attention
 from vllm.v1.attention.backends.flash_attn import use_cascade_attention
 from vllm.v1.attention.backends.utils import (
-    AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout,
-    get_per_layer_parameters, infer_global_hyperparameters,
-    reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills)
+    AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
+    get_kv_cache_layout, get_per_layer_parameters,
+    infer_global_hyperparameters, reorder_batch_to_split_decodes_and_prefills,
+    split_decodes_and_prefills)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 if TYPE_CHECKING:
@@ -174,26 +176,66 @@ class FlashInferMetadata:
 
 
 class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
+    attn_cudagraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.PURE_DECODE_ONLY
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
         self.device = device
+        self.vllm_config = vllm_config
+        self.cache_config = vllm_config.cache_config
+        self.kv_cache_spec = kv_cache_spec
         self._workspace_buffer = None
         self._prefill_wrapper = None  # Wrapper for prefill/append
-        self._decode_wrapper = None  # Wrapper for decode
+        self._decode_wrapper = None  # Wrapper for decode (general shape)
+
+        self.compilation_config = vllm_config.compilation_config
+        max_num_pages_per_req = cdiv(vllm_config.model_config.max_model_len,
+                                     self.kv_cache_spec.block_size)
+        max_num_reqs = vllm_config.scheduler_config.max_num_seqs
+        max_num_pages = max_num_reqs * max_num_pages_per_req
+        self.enable_cuda_graph = self.compilation_config.full_cuda_graph
+        if self.enable_cuda_graph:
+            # For full cudagraph capture, one `decode_wrapper` for each batch
+            # size is needed for FlashInfer.
+            self._decode_wrappers_cudagraph: dict[
+                int, BatchDecodeWithPagedKVCacheWrapper] = {}
+            self._decode_cudagraph_max_bs = min(
+                max_num_reqs, self.compilation_config.max_capture_size)
+
         self._cascade_wrapper = None  # Wrapper for cascade attention
 
         # Global hyperparameters shared by all attention layers
         self.global_hyperparameters = infer_global_hyperparameters(
             get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl))
 
-        self.vllm_config = vllm_config
-        self.cache_config = vllm_config.cache_config
-        self.kv_cache_spec = kv_cache_spec
-        max_num_blocks_per_request = cdiv(
-            vllm_config.model_config.max_model_len,
-            self.kv_cache_spec.block_size)
-        self.block_table_arange = torch.arange(max_num_blocks_per_request,
+        # Preparing persistent buffers (device-side)
+        self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
+                                           dtype=torch.int32,
+                                           device=self.device)
+        self.paged_kv_indices = torch.zeros(
+            max_num_pages,  # max num pages possible
+            dtype=torch.int32,
+            device=self.device)
+        self.paged_kv_last_page_len = torch.zeros(max_num_reqs,
+                                                  dtype=torch.int32,
+                                                  device=self.device)
+        # host-side buffer
+        pin_memory = is_pin_memory_available()
+        self.paged_kv_indptr_cpu = torch.zeros(max_num_reqs + 1,
+                                               dtype=torch.int32,
+                                               device="cpu",
+                                               pin_memory=pin_memory)
+        self.paged_kv_indices_cpu = torch.zeros(max_num_pages,
+                                                dtype=torch.int32,
+                                                device="cpu",
+                                                pin_memory=pin_memory)
+        self.paged_kv_last_page_len_cpu = torch.zeros(max_num_reqs,
+                                                      dtype=torch.int32,
+                                                      device="cpu",
+                                                      pin_memory=pin_memory)
+
+        self.block_table_arange = torch.arange(max_num_pages_per_req,
                                                dtype=torch.int32,
                                                device=self.device)
 
@@ -217,8 +259,16 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 self._get_workspace_buffer(), get_kv_cache_layout())
         return self._prefill_wrapper
 
-    def _get_decode_wrapper(self):
-        if self._decode_wrapper is None:
+    def _get_decode_wrapper(self,
+                            batch_size: int,
+                            use_cudagraph: bool = False):
+        if use_cudagraph:
+            decode_wrapper = self._decode_wrappers_cudagraph.get(
+                batch_size, None)
+        else:
+            decode_wrapper = self._decode_wrapper
+
+        if decode_wrapper is None:
             num_qo_heads = (
                 self.vllm_config.model_config.get_num_attention_heads(
                     self.vllm_config.parallel_config))
@@ -226,11 +276,32 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 self.vllm_config.parallel_config)
             use_tensor_cores = envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or (
                 num_qo_heads // num_kv_heads > 4)
-            self._decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
+
+            if use_cudagraph:
+                paged_kv_indptr = self.paged_kv_indptr[:batch_size + 1]
+                paged_kv_indices = self.paged_kv_indices
+                paged_kv_last_page_len = self.paged_kv_last_page_len[:
+                                                                     batch_size]
+            else:
+                paged_kv_indptr = None
+                paged_kv_indices = None
+                paged_kv_last_page_len = None
+            decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
                 self._get_workspace_buffer(),
                 get_kv_cache_layout(),
+                use_cuda_graph=use_cudagraph,
+                paged_kv_indptr_buffer=paged_kv_indptr,
+                paged_kv_indices_buffer=paged_kv_indices,
+                paged_kv_last_page_len_buffer=paged_kv_last_page_len,
                 use_tensor_cores=use_tensor_cores)
-        return self._decode_wrapper
+
+            # save the decode wrapper
+            if use_cudagraph:
+                self._decode_wrappers_cudagraph[batch_size] = decode_wrapper
+            else:
+                self._decode_wrapper = decode_wrapper
+
+        return decode_wrapper
 
     def _get_cascade_wrapper(self):
         if self._cascade_wrapper is None:
@@ -308,16 +379,44 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 )
 
             if num_decodes > 0:
-                attn_metadata.decode_wrapper = self._get_decode_wrapper()
+                pure_decode = num_prefills == 0
+                # possible required padding for cudagraph replay
+                use_cudagraph = (self.enable_cuda_graph and pure_decode and
+                                 num_decodes <= self._decode_cudagraph_max_bs)
+                if use_cudagraph:
+                    num_input_tokens = (
+                        self.vllm_config.pad_for_cudagraph(num_decodes))
+                    # Carefully fulfill the padding region with reasonable value
+                    # on cpu.
+                    # Make sure paged_kv_indptr_cpu is not decreasing
+                    self.paged_kv_indptr_cpu[1 + num_decodes:1 +
+                                             num_input_tokens].fill_(
+                                                 attn_metadata.
+                                                 paged_kv_indptr_cpu[-1])
+                    # Fill the remaining paged_kv_last_page_len_cpu with 1.
+                    # This is because flashinfer treats 0 as a full page
+                    # instead of empty.
+                    self.paged_kv_last_page_len_cpu[
+                        num_decodes:num_input_tokens].fill_(1)
+
+                else:
+                    num_input_tokens = num_decodes
+
+                attn_metadata.decode_wrapper = self._get_decode_wrapper(
+                    num_input_tokens, use_cudagraph)
                 if not use_trtllm_decode_attention(
                         num_decodes, attn_metadata.max_seq_len,
                         self.cache_config.cache_dtype,
                         attn_metadata.num_qo_heads, attn_metadata.num_kv_heads,
                         attn_metadata.head_dim):
-                    attn_metadata.decode_wrapper.plan(
-                        attn_metadata.paged_kv_indptr_cpu[:num_decodes + 1],
+                    # Use the persistent buffer with padding length,
+                    # instead of the same address but chunked version
+                    # in atten_metadata when using cudagraph.
+                    fast_plan_decode(
+                        attn_metadata.decode_wrapper,
+                        self.paged_kv_indptr_cpu[:num_input_tokens + 1],
                         attn_metadata.paged_kv_indices,
-                        attn_metadata.paged_kv_last_page_len_cpu[:num_decodes],
+                        self.paged_kv_last_page_len_cpu[:num_input_tokens],
                         attn_metadata.num_qo_heads,
                         attn_metadata.num_kv_heads,
                         attn_metadata.head_dim,
@@ -336,6 +435,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
               common_prefix_len: int,
               common_attn_metadata: CommonAttentionMetadata,
               fast_build: bool = False) -> FlashInferMetadata:
+        num_reqs = common_attn_metadata.num_reqs
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens =\
             split_decodes_and_prefills(common_attn_metadata)
@@ -381,18 +481,26 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                                                        non_blocking=True)
         mask = (self.block_table_arange[:max_num_blocks].unsqueeze(0)
                 < block_table_bounds.unsqueeze(1))
-        paged_kv_indices = block_table_tensor[:, :max_num_blocks][mask]
+        # write self.paged_kv_indices inplace
+        num_actual_pages = torch.sum(mask)
+        paged_kv_indices = self.paged_kv_indices[:num_actual_pages]
+        torch.masked_select(block_table_tensor[:, :max_num_blocks],
+                            mask,
+                            out=paged_kv_indices)
 
-        paged_kv_indptr_cpu = torch.zeros(len(block_table_bounds_cpu) + 1,
-                                          dtype=torch.int32,
-                                          device='cpu')
-        paged_kv_indptr_cpu[1:] = block_table_bounds_cpu.cumsum(
-            dim=0, dtype=torch.int32)
+        # write self.paged_kv_indptr_cpu inplace (0-index is always 0)
+        torch.cumsum(block_table_bounds_cpu,
+                     dim=0,
+                     dtype=torch.int32,
+                     out=self.paged_kv_indptr_cpu[1:1 + num_reqs])
 
         paged_kv_last_page_len_cpu = seq_lens_cpu % page_size
-        paged_kv_last_page_len_cpu = torch.where(
-            paged_kv_last_page_len_cpu == 0, page_size,
-            paged_kv_last_page_len_cpu)
+        # write self.paged_kv_last_page_len_cpu inplace
+        torch.where(paged_kv_last_page_len_cpu == 0,
+                    torch.tensor(page_size),
+                    paged_kv_last_page_len_cpu,
+                    out=self.paged_kv_last_page_len_cpu[:num_reqs])
+
         cache_dtype = self.cache_config.cache_dtype
         if cache_dtype.startswith("fp8"):
             kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
@@ -402,9 +510,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         attn_metadata = FlashInferMetadata(
             num_actual_tokens=num_actual_tokens,
             qo_indptr_cpu=common_attn_metadata.query_start_loc_cpu,
-            paged_kv_indptr_cpu=paged_kv_indptr_cpu,
+            paged_kv_indptr_cpu=self.paged_kv_indptr_cpu[:1 + num_reqs],
             paged_kv_indices=paged_kv_indices,
-            paged_kv_last_page_len_cpu=paged_kv_last_page_len_cpu,
+            paged_kv_last_page_len_cpu=self.
+            paged_kv_last_page_len_cpu[:num_reqs],
             num_qo_heads=self.vllm_config.model_config.get_num_attention_heads(
                 self.vllm_config.parallel_config),
             num_kv_heads=self.kv_cache_spec.num_kv_heads,
@@ -431,6 +540,26 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
 
         return attn_metadata
 
+    def build_for_cudagraph_capture(
+            self, common_attn_metadata: CommonAttentionMetadata):
+        """
+        This method builds the metadata for full cudagraph capture.
+        Currently, only decode is supported for full cudagraphs with FlashInfer.
+        """
+        m = common_attn_metadata
+
+        assert m.num_reqs == m.num_actual_tokens, \
+            "FlashInfer only supports decode-only full CUDAGraph capture. " \
+            "Make sure all cudagraph capture sizes <= max_num_seq."
+
+        m.max_query_len = 1  # decode-only
+
+        return self.build(0, m)
+
+    def can_run_in_cudagraph(
+            self, common_attn_metadata: CommonAttentionMetadata) -> bool:
+        return common_attn_metadata.max_query_len == 1
+
     def use_cascade_attention(self, *args, **kwargs) -> bool:
         if self.kv_cache_spec.dtype != self.vllm_config.model_config.dtype:
             # TODO: The cascade wrapper currently does not support setting
@@ -638,3 +767,163 @@ class FlashInferImpl(AttentionImpl):
                         out=output[:num_decode_tokens],
                     )
         return output_padded
+
+
+def fast_plan_decode(
+    self,  # decode wrapper
+    indptr_cpu: torch.Tensor,
+    indices: torch.Tensor,
+    last_page_len_cpu: torch.Tensor,
+    num_qo_heads: int,
+    num_kv_heads: int,
+    head_dim: int,
+    page_size: int,
+    pos_encoding_mode: str = "NONE",
+    window_left: int = -1,
+    logits_soft_cap: Optional[float] = None,
+    q_data_type: Optional[Union[str, torch.dtype]] = "float16",
+    kv_data_type: Optional[Union[str, torch.dtype]] = None,
+    data_type: Optional[Union[str, torch.dtype]] = None,
+    sm_scale: Optional[float] = None,
+    rope_scale: Optional[float] = None,
+    rope_theta: Optional[float] = None,
+    non_blocking: bool = True,
+) -> None:
+    """
+    A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for 
+    cudagraph capture/replay, while the no cudagraph version turns back 
+    to the original plan.
+    using original plan after passing host-side buffers:
+    - only host-to-device copy of indptr and last_page_len buffers
+    Modifications for cudagraph:
+    - only host-to-device copy of indptr and last_page_len buffers.
+    - avoid device-to-device copy of indices buffer.
+
+    Part of the code get inspiration from the original plan from FlashInfer repo
+    and the implementation of fast_decode_plan for FlashInfer in SGlang repo.
+    """
+    # Warm up with the original plan if it is first call, and always run the
+    # original plan if we run for dynamic shape. For fixed shape (cudagraph),
+    # this warm up is to generate the _cached_module for the decode wrapper.
+    if not self.is_cuda_graph_enabled or \
+        getattr(self, "vllm_first_call", True):
+        self.plan(
+            indptr_cpu,
+            indices,
+            last_page_len_cpu,
+            num_qo_heads,
+            num_kv_heads,
+            head_dim,
+            page_size,
+            pos_encoding_mode,
+            window_left,
+            logits_soft_cap,
+            q_data_type,
+            kv_data_type,
+            data_type,
+            sm_scale,
+            rope_scale,
+            rope_theta,
+            non_blocking,
+        )
+        self.vllm_first_call = False
+        return
+
+    assert self.is_cuda_graph_enabled, "Should be cudagraph only here"
+
+    batch_size = len(last_page_len_cpu)
+    if logits_soft_cap is None:
+        logits_soft_cap = 0.0
+
+    # Handle data types consistently
+    if data_type is not None:
+        if q_data_type is None:
+            q_data_type = data_type
+        if kv_data_type is None:
+            kv_data_type = data_type
+    elif q_data_type is None:
+        q_data_type = "float16"
+
+    if kv_data_type is None:
+        kv_data_type = q_data_type
+    q_data_type = getattr(torch, q_data_type) if isinstance(
+        q_data_type, str) else q_data_type
+    kv_data_type = getattr(torch, kv_data_type) if isinstance(
+        kv_data_type, str) else kv_data_type
+
+    if self.use_tensor_cores:
+        qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
+
+    if batch_size != self._fixed_batch_size:
+        raise ValueError(
+            "The batch size should be fixed in cudagraph mode, the runtime "
+            "batch size {} mismatches the batch size set during "
+            "initialization {}".format(batch_size, self._fixed_batch_size))
+    if len(indices) > len(self._paged_kv_indices_buf):
+        raise ValueError(
+            "The size of indices should be less than or equal to the "
+            "allocated buffer")
+
+    # host-to-device copy for the indptr buffer
+    self._paged_kv_indptr_buf.copy_(indptr_cpu, non_blocking=True)
+    # host-to-device copy for the last_page_len buffer
+    self._paged_kv_last_page_len_buf.copy_(last_page_len_cpu,
+                                           non_blocking=True)
+
+    indptr_host = indptr_cpu
+    last_page_len_host = last_page_len_cpu
+
+    if self.use_tensor_cores:
+        kv_lens_arr_host = get_seq_lens(indptr_host, last_page_len_host,
+                                        page_size)
+
+        try:
+            # Make sure we pass exactly 15 arguments for tensor core version
+            self._plan_info = self._cached_module.plan(
+                self._float_workspace_buffer,
+                self._int_workspace_buffer,
+                self._pin_memory_int_workspace_buffer,
+                qo_indptr_host,
+                indptr_host,
+                kv_lens_arr_host,
+                batch_size,  # total_num_rows
+                batch_size,
+                num_qo_heads,
+                num_kv_heads,
+                page_size,
+                self.is_cuda_graph_enabled,
+                head_dim,
+                head_dim,
+                False,  # causal
+            )
+        except Exception as e:
+            raise RuntimeError(f"Error in tensor core plan: {e}") from e
+    else:
+        try:
+            # Make sure we pass exactly 15 arguments for standard version
+            self._plan_info = self._cached_module.plan(
+                self._float_workspace_buffer,
+                self._int_workspace_buffer,
+                self._pin_memory_int_workspace_buffer,
+                indptr_host,
+                batch_size,
+                num_qo_heads,
+                num_kv_heads,
+                page_size,
+                self.is_cuda_graph_enabled,
+                window_left,
+                logits_soft_cap,
+                head_dim,
+                head_dim,
+                torch.empty(0, dtype=q_data_type),
+                torch.empty(0, dtype=kv_data_type),
+            )
+        except Exception as e:
+            raise RuntimeError(f"Error in standard plan: {e}") from e
+
+    self._pos_encoding_mode = pos_encoding_mode
+    self._window_left = window_left
+    self._logits_soft_cap = logits_soft_cap
+    self._sm_scale = sm_scale
+    self._rope_scale = rope_scale
+    self._rope_theta = rope_theta
diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index 39463b9c06164..b5aecff9937f3 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -18,6 +18,7 @@ from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
                                                    MLACommonImpl,
                                                    MLACommonMetadata,
                                                    MLACommonMetadataBuilder)
+from vllm.v1.attention.backends.utils import AttentionCGSupport
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
@@ -54,7 +55,8 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]):
 
 
 class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
-    full_cudagraph_supported: ClassVar[bool] = True  # Decode-only
+    attn_cudagraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.PURE_DECODE_ONLY
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 5c5891f035ae2..8b55e1a301992 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -17,6 +17,7 @@ from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
                                                    MLACommonImpl,
                                                    MLACommonMetadata,
                                                    MLACommonMetadataBuilder)
+from vllm.v1.attention.backends.utils import AttentionCGSupport
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 # yapf: enable
@@ -64,7 +65,8 @@ class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]):
 
 
 class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
-    full_cudagraph_supported: ClassVar[bool] = True  # decode only
+    attn_cudagraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.PURE_DECODE_ONLY
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 195fbd3b1b9c4..942cb95eefa2f 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -18,7 +18,8 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
-from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
+from vllm.v1.attention.backends.utils import (AttentionCGSupport,
+                                              AttentionMetadataBuilder,
                                               CommonAttentionMetadata)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
@@ -57,7 +58,8 @@ class TritonAttentionMetadata:
 
 class TritonAttentionMetadataBuilder(
         AttentionMetadataBuilder[TritonAttentionMetadata]):
-    full_cudagraph_supported: ClassVar[bool] = True
+    attn_cudagraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.ALWAYS
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 36bacf0cb36f8..d39cc0a39f45c 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import abc
+import enum
 import functools
 from abc import abstractmethod
 from dataclasses import dataclass, make_dataclass
@@ -65,9 +66,24 @@ class CommonAttentionMetadata:
 M = TypeVar("M")
 
 
+class AttentionCGSupport(enum.Enum):
+    """ Constants for the cudagraph support of the attention backend
+    Here we do not consider the cascade attention, as currently
+    it is never cudagraph supported."""
+
+    NEVER = 0
+    """NO cudagraph support"""
+    PURE_DECODE_ONLY = 1
+    """Cudagraph supported for pure decode, need to run without
+    cudagraph for mixed prefill-decode batches"""
+    ALWAYS = 2
+    """Cudagraph always supported"""
+
+
 class AttentionMetadataBuilder(abc.ABC, Generic[M]):
     # Does this backend/builder support CUDA Graphs for attention.
-    full_cudagraph_supported: ClassVar[bool] = False
+    attn_cudagraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.NEVER
 
     @abstractmethod
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 29cda4d837bf3..d5a5799efb47c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -47,7 +47,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         is_pin_memory_available, round_up, supports_dynamo)
 from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend
 from vllm.v1.attention.backends.utils import (
-    AttentionMetadataBuilder, CommonAttentionMetadata,
+    AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
     make_kv_sharing_fast_prefill_attention_metadata,
     make_local_attention_virtual_batches)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
@@ -2619,12 +2619,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.device,
         )
 
-        if (self.full_cuda_graph
-                and not attn_metadata_builder_i.full_cudagraph_supported):
-            raise ValueError(
-                f"Full CUDAGraph not supported for "
-                f"{attn_backend_i.__name__}. Turn off CompilationConfig."
-                f"full_cuda_graph or use a different attention backend.")
+        if self.full_cuda_graph:
+            if attn_metadata_builder_i.attn_cudagraph_support == \
+                AttentionCGSupport.NEVER:
+                raise ValueError(f"Full CUDAGraph not supported for "
+                                 f"{attn_backend_i.__name__}. Turn off "
+                                 f"CompilationConfig.full_cuda_graph or use a "
+                                 f" different attention backend.")
+            if attn_metadata_builder_i.attn_cudagraph_support == \
+                AttentionCGSupport.PURE_DECODE_ONLY:
+                # Limit the max cudagraph size to the max number of
+                # sequences for pure decode only cudagraph backend,
+                # whose max_query_len is 1.
+                self.cudagraph_batch_sizes = [
+                    size for size in self.cudagraph_batch_sizes
+                    if size <= self.scheduler_config.max_num_seqs
+                ]
         return attn_backend_i, attn_metadata_builder_i
 
     def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 0f46ed223ab88..4bc4ece9a0df4 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -321,11 +321,16 @@ class Worker(WorkerBase):
         if get_pp_group().is_last_rank:
             max_num_reqs = min(self.scheduler_config.max_num_seqs,
                                self.scheduler_config.max_num_batched_tokens)
+            # activate building attn_metadata for this dummy run to avoid
+            # potential illegal memory access for full cudagraph relay.
+            attn_cudagraph = self.compilation_config.full_cuda_graph and\
+                not self.model_config.enforce_eager
 
             # We skip EPLB here since we don't want to record dummy metrics
             hidden_states, last_hidden_states = \
                 self.model_runner._dummy_run(
                     num_tokens=max_num_reqs,
+                    capture_attn_cudagraph=attn_cudagraph,
                     skip_eplb=True,
                 )
             if self.model_runner.is_pooling_model:

From ee2eb6ecd86be4b47e334f74feb7874b9a41ca25 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Sat, 2 Aug 2025 10:34:37 +0800
Subject: [PATCH 49/54] [Model] Qwen2.5 VL SiLU-and-Mul (#22066)

Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: kf <kuanfu.liu@embeddedllm.com>
---
 vllm/model_executor/models/qwen2_5_vl.py | 44 +++++++++++-------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index c4c4650f569e1..04e64422d2e0b 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -43,9 +43,10 @@ from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
-from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
+from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -171,16 +172,12 @@ class Qwen2_5_VisionMLP(nn.Module):
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = ""):
         super().__init__()
-        self.gate_proj = ColumnParallelLinear(in_features,
-                                              hidden_features,
-                                              bias=bias,
-                                              quant_config=quant_config,
-                                              prefix=f"{prefix}.gate_proj")
-        self.up_proj = ColumnParallelLinear(in_features,
-                                            hidden_features,
-                                            bias=bias,
-                                            quant_config=quant_config,
-                                            prefix=f"{prefix}.up_proj")
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=in_features,
+            output_sizes=[hidden_features] * 2,  # [gate_proj, up_proj]
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
         self.down_proj = RowParallelLinear(hidden_features,
                                            in_features,
                                            bias=bias,
@@ -189,10 +186,9 @@ class Qwen2_5_VisionMLP(nn.Module):
         self.act_fn = act_fn
 
     def forward(self, x: torch.Tensor):
-        x_gate, _ = self.gate_proj(x)
-        x_gate = self.act_fn(x_gate)
-        x_up, _ = self.up_proj(x)
-        x_down, _ = self.down_proj(x_gate * x_up)
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x_down, _ = self.down_proj(x)
         return x_down
 
 
@@ -540,14 +536,14 @@ class Qwen2_5_VisionTransformer(nn.Module):
         self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
 
         self.blocks = nn.ModuleList([
-            Qwen2_5_VisionBlock(
-                dim=self.hidden_size,
-                num_heads=self.num_heads,
-                mlp_hidden_dim=vision_config.intermediate_size,
-                act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
-                norm_layer=norm_layer,
-                quant_config=quant_config,
-                prefix=f"{prefix}.blocks.{layer_idx}")
+            Qwen2_5_VisionBlock(dim=self.hidden_size,
+                                num_heads=self.num_heads,
+                                mlp_hidden_dim=vision_config.intermediate_size,
+                                act_fn=get_act_and_mul_fn(
+                                    vision_config.hidden_act),
+                                norm_layer=norm_layer,
+                                quant_config=quant_config,
+                                prefix=f"{prefix}.blocks.{layer_idx}")
             for layer_idx in range(depth)
         ])
         self.merger = Qwen2_5_VisionPatchMerger(
@@ -752,6 +748,8 @@ class Qwen2_5_VisionTransformer(nn.Module):
             ("attn.qkv.", "attn.q.", "q"),
             ("attn.qkv.", "attn.k.", "k"),
             ("attn.qkv.", "attn.v.", "v"),
+            ("mlp.gate_up_proj.", "mlp.gate_proj.", 0),
+            ("mlp.gate_up_proj.", "mlp.up_proj.", 1),
         ]
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: set[str] = set()

From 57393715e804387588241fbdb4ec94a7570230b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Sat, 2 Aug 2025 04:41:40 +0200
Subject: [PATCH 50/54] [Misc] `VLLM_TARGET_DEVICE.lower()` (#22101)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 vllm/envs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 2fda2903179b5..c161fa0dff6ba 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -213,7 +213,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Target device of vLLM, supporting [cuda (by default),
     # rocm, neuron, cpu]
     "VLLM_TARGET_DEVICE":
-    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
+    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
 
     # Maximum number of compilation jobs to run in parallel.
     # By default this is the number of CPUs

From a65f46be5ea9a92dde48df2b951c1915aa1d9595 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Sat, 2 Aug 2025 08:12:03 +0530
Subject: [PATCH 51/54] [Misc] DeepGemmExperts : Avoid JIT generation in the
 hot-path (#21955)

Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
---
 vllm/envs.py                                  |  9 +++
 .../layers/fused_moe/deep_gemm_moe.py         | 77 ++++++++++++++++++-
 vllm/utils/deep_gemm.py                       |  7 ++
 3 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index c161fa0dff6ba..2d470c6dccbfd 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -126,6 +126,7 @@ if TYPE_CHECKING:
     VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None
     VLLM_TPU_USING_PATHWAYS: bool = False
     VLLM_USE_DEEP_GEMM: bool = False
+    VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
     VLLM_XGRAMMAR_CACHE_MB: int = 0
@@ -910,6 +911,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_USE_DEEP_GEMM":
     lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
 
+    # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
+    # JIT all the required kernels before model execution so there is no
+    # JIT'ing in the hot-path. However, this warmup increases the engine
+    # startup time by a couple of minutes.
+    # Set `VLLM_SKIP_DEEP_GEMM_WARMUP` to disable the warmup.
+    "VLLM_SKIP_DEEP_GEMM_WARMUP":
+    lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))),
+
     # Allow use of FlashInfer MoE kernels for fused moe ops.
     "VLLM_USE_FLASHINFER_MOE_FP8":
     lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))),
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index b89e5ac6f093e..bd3605378b6dc 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -4,7 +4,9 @@ import functools
 from typing import Any, Optional
 
 import torch
+from tqdm import tqdm
 
+import vllm.envs as env
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
@@ -17,7 +19,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8)
-from vllm.utils import has_deep_gemm
+from vllm.utils import has_deep_gemm, run_once
 from vllm.utils.deep_gemm import m_grouped_fp8_gemm_nt_contiguous
 
 logger = init_logger(__name__)
@@ -82,6 +84,65 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
     return True
 
 
+@run_once
+def warmup_deepgemm_gg_contiguous_kernels(w1: torch.Tensor, w2: torch.Tensor,
+                                          w1_scale: torch.Tensor,
+                                          w2_scale: torch.Tensor,
+                                          num_topk: int):
+    """
+    DeepGemm JITs the grouped-gemm kernels. The JIT'ing happens based on the
+    input tensor shapes. In this function, we construct all possible input
+    tensor shapes so all the kernels are JIT'ed and cached.
+    Note that this warmup is expected to happen during the model profile
+    call and not during actual model inference.
+    """
+
+    assert w1.size(0) == w2.size(0), (
+        "w1 and w2 must have the same number of experts")
+
+    block_m = deep_gemm_block_shape()[0]
+    num_experts = w1.size(0)
+    device = w1.device
+
+    # This is the maximum GroupedGemm M size that we expect to run
+    # the grouped_gemm with.
+    MAX_M = compute_aligned_M(env.VLLM_FUSED_MOE_CHUNK_SIZE,
+                              num_topk,
+                              num_experts,
+                              block_m,
+                              expert_tokens_meta=None)
+    # Distribute expert-ids evenly.
+    MAX_BLOCKS = MAX_M // block_m
+    expert_ids_block = torch.randint(low=0,
+                                     high=num_experts,
+                                     size=(MAX_BLOCKS, ),
+                                     device=device,
+                                     dtype=torch.int32)
+    expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0)
+
+    def _warmup(w: torch.Tensor, w_scale: torch.Tensor):
+
+        _, n, k = w.size()
+        a1q = torch.empty((MAX_M, k), device=device).to(torch.float8_e4m3fn)
+        a1q_scales = torch.empty((MAX_M, k // block_m),
+                                 device=device,
+                                 dtype=torch.float32)
+        out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16)
+
+        pbar = tqdm(total=MAX_BLOCKS,
+                    desc=f"DeepGemmExperts GEMM warmup (MAX_M={MAX_M})")
+        num_tokens = MAX_M
+        while num_tokens > 0:
+            m_grouped_fp8_gemm_nt_contiguous(
+                (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, w_scale),
+                out[:num_tokens], expert_ids[:num_tokens])
+            pbar.update(1)
+            num_tokens = num_tokens - block_m
+
+    _warmup(w1, w1_scale)
+    _warmup(w2, w2_scale)
+
+
 class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def __init__(self):
@@ -156,6 +217,20 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     ):
         assert self.block_shape is not None
         assert a1q_scale is not None
+        assert w1_scale is not None
+        assert w2_scale is not None
+
+        if not env.VLLM_SKIP_DEEP_GEMM_WARMUP:
+            # DeepGemm JITs the grouped-gemm kernels. We don't want the JIT'ing
+            # to happen during actual model-inference. The
+            # `warmup_deepgemm_kernels` function is a `run_once` decorated
+            # function that executes during the model profile run. This warmup
+            # should create all the required JITs for the current model.
+            warmup_deepgemm_gg_contiguous_kernels(w1,
+                                                  w2,
+                                                  w1_scale,
+                                                  w2_scale,
+                                                  num_topk=topk_ids.size(1))
 
         a1q = hidden_states
         _, N, K = w1.size()
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 4dedee2a3f862..8ab34e7505ee2 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -8,6 +8,7 @@ from __future__ import annotations
 
 import functools
 import importlib
+import os
 from typing import Any, Callable, NoReturn
 
 import torch
@@ -77,6 +78,12 @@ def _lazy_init() -> None:
     if not has_deep_gemm():
         return
 
+    # Set up deep_gemm cache path
+    DEEP_GEMM_JIT_CACHE_ENV_NAME = 'DG_JIT_CACHE_DIR'
+    if not os.environ.get(DEEP_GEMM_JIT_CACHE_ENV_NAME, None):
+        os.environ[DEEP_GEMM_JIT_CACHE_ENV_NAME] = os.path.join(
+            envs.VLLM_CACHE_ROOT, "deep_gemm")
+
     _dg = importlib.import_module("deep_gemm")
 
     _fp8_gemm_nt_impl = _resolve_symbol(_dg, "fp8_gemm_nt",

From 9f9c38c392476fd35b9154221c00a2255dcfd010 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Fri, 1 Aug 2025 22:43:37 -0400
Subject: [PATCH 52/54] [Speculators][Speculative Decoding] Add Qwen Eagle3
 Support (#21835)

Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
---
 .../speculators/test_eagle3.py                | 14 +++++++++++--
 vllm/config.py                                | 15 ++++++++++---
 vllm/model_executor/models/qwen2.py           | 21 +++++++++++++------
 vllm/model_executor/models/qwen3.py           |  7 +++++++
 4 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py
index c58fc8c0dc5f4..c46ac7a88b751 100644
--- a/tests/speculative_decoding/speculators/test_eagle3.py
+++ b/tests/speculative_decoding/speculators/test_eagle3.py
@@ -6,11 +6,21 @@ import torch
 
 @pytest.mark.parametrize(
     "model_path",
-    [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717"),
-     ("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")])
+    [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")])
 def test_llama(vllm_runner, example_prompts, model_path):
     with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                   max_tokens=20)
         print(vllm_outputs)
         assert vllm_outputs
+
+
+@pytest.mark.parametrize(
+    "model_path",
+    [("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")])
+def test_qwen(vllm_runner, example_prompts, model_path):
+    with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                  max_tokens=20)
+        print(vllm_outputs)
+        assert vllm_outputs
diff --git a/vllm/config.py b/vllm/config.py
index dabb4b524dfd8..95dae4275edf3 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3175,10 +3175,19 @@ class SpeculativeConfig:
                              "speculative decoding is > 1, but got "
                              f"{self.disable_by_batch_size=}")
 
-        if self.method == "eagle3" and self.target_model_config and \
-            "llama" not in self.target_model_config.hf_text_config.model_type:
+        from vllm.transformers_utils.configs import SpeculatorsConfig
+
+        eagle3_target_supported = ["llama"]
+        if self.draft_model_config and isinstance(
+                self.draft_model_config.hf_config, SpeculatorsConfig):
+            eagle3_target_supported.append("qwen")
+
+        if self.method == "eagle3" and self.target_model_config and not any(
+                supported_model in
+                self.target_model_config.hf_text_config.model_type
+                for supported_model in eagle3_target_supported):
             raise ValueError(
-                "Eagle3 is only supported for Llama models. "
+                f"Eagle3 is only supported for {eagle3_target_supported} models. "  # noqa: E501
                 f"Got {self.target_model_config.hf_text_config.model_type=}")
 
         return self
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 23f65b99c22ce..0e7507a4570be 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -330,6 +330,8 @@ class Qwen2Model(nn.Module):
         else:
             self.norm = PPMissingLayer()
 
+        self.aux_hidden_state_layers: tuple[int] = tuple()
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
 
@@ -350,18 +352,25 @@ class Qwen2Model(nn.Module):
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
-        for layer in self.layers[self.start_layer:self.end_layer]:
-            hidden_states, residual = layer(
-                positions,
-                hidden_states,
-                residual,
-            )
+
+        aux_hidden_states = []
+        for idx, layer in enumerate(
+                self.layers[self.start_layer:self.end_layer]):
+            if idx in self.aux_hidden_state_layers:
+                aux_hidden_states.append(hidden_states + residual)
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({
                 "hidden_states": hidden_states,
                 "residual": residual
             })
+
         hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) > 0:
+            return hidden_states, aux_hidden_states
+
         return hidden_states
 
     def load_weights(self, weights: Iterable[tuple[str,
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index 393ce41a91a00..d2ae8959b103d 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -288,6 +288,13 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
+    def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None:
+        self.model.aux_hidden_state_layers = layers
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int]:
+        num_layers = len(self.model.layers)
+        return (2, num_layers // 2, num_layers - 3)
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 

From 8d524ce79ffd0571d6a576cb9a5c21feab187246 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Sat, 2 Aug 2025 03:45:27 +0100
Subject: [PATCH 53/54] [BugFix] Improve internal DP load balancing (#21617)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 vllm/entrypoints/openai/api_server.py |   3 +
 vllm/v1/engine/async_llm.py           |   4 +
 vllm/v1/engine/coordinator.py         | 110 +++++++++++++++++---------
 vllm/v1/engine/core.py                |  13 +--
 vllm/v1/engine/core_client.py         |  46 +++++++----
 vllm/v1/metrics/stats.py              |   4 +
 vllm/v1/utils.py                      |   1 +
 7 files changed, 122 insertions(+), 59 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index b8ec5461f7719..9bf4702320788 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -199,6 +199,8 @@ async def build_async_engine_client_from_engine_args(
 
         from vllm.v1.engine.async_llm import AsyncLLM
         async_llm: Optional[AsyncLLM] = None
+        client_count = client_config.pop(
+            "client_count") if client_config else 1
         client_index = client_config.pop(
             "client_index") if client_config else 0
         try:
@@ -208,6 +210,7 @@ async def build_async_engine_client_from_engine_args(
                 enable_log_requests=engine_args.enable_log_requests,
                 disable_log_stats=engine_args.disable_log_stats,
                 client_addresses=client_config,
+                client_count=client_count,
                 client_index=client_index)
 
             # Don't keep the dummy data in memory
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 308ca32105ba9..45f450291ab63 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -57,6 +57,7 @@ class AsyncLLM(EngineClient):
         start_engine_loop: bool = True,
         stat_loggers: Optional[list[StatLoggerFactory]] = None,
         client_addresses: Optional[dict[str, str]] = None,
+        client_count: int = 1,
         client_index: int = 0,
     ) -> None:
         """
@@ -120,6 +121,7 @@ class AsyncLLM(EngineClient):
             executor_class=executor_class,
             log_stats=self.log_stats,
             client_addresses=client_addresses,
+            client_count=client_count,
             client_index=client_index,
         )
 
@@ -156,6 +158,7 @@ class AsyncLLM(EngineClient):
             enable_log_requests: bool = False,
             disable_log_stats: bool = False,
             client_addresses: Optional[dict[str, str]] = None,
+            client_count: int = 1,
             client_index: int = 0,
             disable_log_requests: bool = True,  # Deprecated, will be removed
     ) -> "AsyncLLM":
@@ -176,6 +179,7 @@ class AsyncLLM(EngineClient):
             log_stats=not disable_log_stats,
             usage_context=usage_context,
             client_addresses=client_addresses,
+            client_count=client_count,
             client_index=client_index,
         )
 
diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py
index 8d8d1689e61e3..596edfdbe24f8 100644
--- a/vllm/v1/engine/coordinator.py
+++ b/vllm/v1/engine/coordinator.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
 import multiprocessing
 import time
 import weakref
@@ -65,18 +66,14 @@ class DPCoordinator:
 
         # Assume coordinator is colocated with front-end procs when not in
         # either external or hybrid DP LB mode.
+        local_only = not (external_lb or hybrid_lb)
         front_publish_address = get_engine_client_zmq_addr(
-            local_only=not external_lb and not hybrid_lb, host=host)
+            local_only=local_only, host=host)
 
         local_only_eng = dp_size == parallel_config.data_parallel_size_local
         back_publish_address = get_engine_client_zmq_addr(local_only_eng, host)
         back_output_address = get_engine_client_zmq_addr(local_only_eng, host)
 
-        # When in external LB mode, load stats aren't published, only changes
-        # to request wave / running state, so we don't need to rate-limit the
-        # updates to the front-end proc(s).
-        min_stats_update_interval_ms = 0 if external_lb else 100
-
         context = get_mp_context()
         self.proc: multiprocessing.Process = context.Process(
             target=DPCoordinatorProc.run_coordinator,
@@ -86,7 +83,6 @@ class DPCoordinator:
                 "front_publish_address": front_publish_address,
                 "back_output_address": back_output_address,
                 "back_publish_address": back_publish_address,
-                "min_stats_update_interval_ms": min_stats_update_interval_ms,
             },
             daemon=True)
         self.proc.start()
@@ -125,10 +121,6 @@ class DPCoordinatorProc:
 
         self.stats_update_interval_ms = min_stats_update_interval_ms
 
-        self.current_wave = 0
-        self.engines_running = False
-        self.stats_changed = False
-
     @staticmethod
     def run_coordinator(
         engine_count: int,
@@ -155,6 +147,16 @@ class DPCoordinatorProc:
 
         decoder = MsgpackDecoder(EngineCoreOutputs)
 
+        # For tracking request wave progression.
+        current_wave = 0
+        engines_running = False
+
+        # For tracking request counts for internal load-balancing.
+        stats_changed = False
+        last_stats_step = -1
+        last_stats_wave = -1
+        last_step_counts: Optional[list[list[int]]] = None
+
         with make_zmq_socket(
                 path=front_publish_address,  # IPC
                 ctx=self.ctx,
@@ -191,21 +193,33 @@ class DPCoordinatorProc:
             while True:
                 elapsed = int(time.time() * 1000) - last_publish_time
                 # Send at stats_update_interval_ms interval if the stats have
-                # changed, or otherwise every 4 seconds.
+                # changed, or otherwise every 5 seconds.
                 wait_for = (self.stats_update_interval_ms
-                            if self.stats_changed else 4000)
-                events = poller.poll(timeout=max(0, wait_for - elapsed))
+                            if stats_changed else 5000)
+
+                # Wait at least 50ms to ensure we've received all stats for
+                # the current step.
+                min_timeout = 50 if last_step_counts is None else 0
+
+                events = poller.poll(timeout=max(min_timeout, wait_for -
+                                                 elapsed))
                 if not events:
                     # Poller timeout - publish current stats to front-ends.
-                    engine_req_counts_list = self._get_engine_counts()
-                    to_publish = (engine_req_counts_list, self.current_wave,
-                                  self.engines_running)
+                    if last_step_counts is not None:
+                        engine_req_counts_list = last_step_counts
+                        last_step_counts = None
+                    else:
+                        engine_req_counts_list = self._get_engine_counts()
+                        stats_changed = False
+
+                    to_publish = (engine_req_counts_list, current_wave,
+                                  engines_running)
                     publish_front.send(msgspec.msgpack.encode(to_publish))
                     last_publish_time = int(time.time() * 1000)
-                    self.stats_changed = False
                     continue
 
                 events = dict(events)
+                wave_state_changed = False
 
                 if publish_front in events:
                     buffer = publish_front.recv()
@@ -232,7 +246,7 @@ class DPCoordinatorProc:
                             # current_wave
                             # we note that 0 is the wave number for the new
                             # engine
-                            self.engines_running = False
+                            engines_running = False
                             logger.info(
                                 "DPCoordinator scaled up from %s to %s "
                                 "engines", current_count, new_engine_count)
@@ -248,15 +262,15 @@ class DPCoordinatorProc:
                     # engines are paused, so that we can wake the other
                     # engines.
                     engine_to_exclude, wave = decoded
-                    if not self.engines_running:
-                        if wave < self.current_wave:
+                    if not engines_running:
+                        if wave < current_wave:
                             # If the wave number is stale, ensure the message
                             # is handled by all the engines.
                             engine_to_exclude = None
 
-                        self.engines_running = True
-                        self.stats_changed = True
-                        self._send_start_wave(publish_back, self.current_wave,
+                        engines_running = True
+                        wave_state_changed = True
+                        self._send_start_wave(publish_back, current_wave,
                                               engine_to_exclude)
 
                 if output_back in events:
@@ -274,36 +288,56 @@ class DPCoordinatorProc:
                         # 1. Updated request load stats - update our local
                         # state with these.
                         stats = self.engines[eng_index].request_counts
+                        stats_step = scheduler_stats.step_counter
+                        stats_wave = scheduler_stats.current_wave
+                        if (stats_wave > last_stats_wave
+                                or stats_wave == last_stats_wave
+                                and stats_step > last_stats_step):
+                            if stats_changed:
+                                last_step_counts = self._get_engine_counts(
+                                    do_copy=True)
+                            last_stats_step = stats_step
+                            last_stats_wave = stats_wave
+                        elif stats_wave != last_stats_wave or (
+                                stats_step != last_stats_step):
+                            logger.warning(
+                                "Received stats for out-of-order "
+                                "step (%d, %d) from engine %d (expected "
+                                "> (%d, %d))", stats_wave, stats_step,
+                                eng_index, last_stats_wave, last_stats_step)
                         stats[0] = scheduler_stats.num_waiting_reqs
                         stats[1] = scheduler_stats.num_running_reqs
-                        self.stats_changed = True
+                        stats_changed = True
 
                     if (wave := outputs.wave_complete) is not None:
                         # 2. Notification from rank 0 engine that we've
                         # moved into the global paused state
                         # (engines_running==False).
-                        if self.current_wave <= wave:
+                        if current_wave <= wave:
                             new_wave = wave + 1
                             logger.debug("Moving DP wave from %d to %d.",
-                                         self.current_wave, new_wave)
-                            self.current_wave = new_wave
-                            self.engines_running = False
-                            self.stats_changed = True
+                                         current_wave, new_wave)
+                            current_wave = new_wave
+                            engines_running = False
+                            wave_state_changed = True
                     elif (wave := outputs.start_wave) is not None and (
-                            wave > self.current_wave or
-                        (wave == self.current_wave
-                         and not self.engines_running)):
+                            wave > current_wave or
+                        (wave == current_wave and not engines_running)):
                         # 3. The engine received request for a non-current wave
                         # so we must ensure that other engines progress to the
                         # next wave (race condition handling).
                         logger.debug(
                             "Starting wave %d after notification of "
                             "stale wave request from engine.", wave)
-                        self.current_wave = wave
-                        self.engines_running = True
-                        self.stats_changed = True
+                        current_wave = wave
+                        engines_running = True
+                        wave_state_changed = True
                         self._send_start_wave(publish_back, wave, eng_index)
 
+                if wave_state_changed:
+                    message = (None, current_wave, engines_running)
+                    publish_front.send(msgspec.msgpack.encode(message))
+
     @staticmethod
     def _send_start_wave(socket: zmq.Socket, wave: int,
                          exclude_engine_index: Optional[int]):
@@ -316,6 +350,8 @@ class DPCoordinatorProc:
         socket.send_multipart(
             (EngineCoreRequestType.START_DP_WAVE.value, wave_encoded))
 
-    def _get_engine_counts(self) -> list[list[int]]:
+    def _get_engine_counts(self, do_copy=False) -> list[list[int]]:
         """Return list of [waiting, running] count lists for each engine."""
+        if do_copy:
+            return [copy.copy(e.request_counts) for e in self.engines]
         return [e.request_counts for e in self.engines]
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 0a889b2a0a184..79c47e1028882 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -928,7 +928,7 @@ class DPEngineCoreProc(EngineCoreProc):
     ):
         # Counts forward-passes of the model so that we can synchronize
         # finished with DP peers every N steps.
-        self.counter = 0
+        self.step_counter = 0
         self.current_wave = 0
         self.last_counts = (0, 0)
 
@@ -999,7 +999,9 @@ class DPEngineCoreProc(EngineCoreProc):
         counts = self.scheduler.get_request_counts()
         if counts != self.last_counts:
             self.last_counts = counts
-            stats = SchedulerStats(*counts)
+            stats = SchedulerStats(*counts,
+                                   step_counter=self.step_counter,
+                                   current_wave=self.current_wave)
             self.output_queue.put_nowait(
                 (-1, EngineCoreOutputs(scheduler_stats=stats)))
 
@@ -1041,15 +1043,16 @@ class DPEngineCoreProc(EngineCoreProc):
                     self.output_queue.put_nowait(
                         (client_index,
                          EngineCoreOutputs(wave_complete=self.current_wave)))
+                # Increment wave count and reset step counter.
                 self.current_wave += 1
+                self.step_counter = 0
 
     def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
 
         # Optimization - only perform finish-sync all-reduce every 32 steps.
-        self.counter += 1
-        if self.counter != 32:
+        self.step_counter += 1
+        if self.step_counter % 32 != 0:
             return True
-        self.counter = 0
 
         return ParallelConfig.has_unfinished_dp(self.dp_group,
                                                 local_unfinished)
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 26985df6f62df..4d30bb6b74466 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -86,11 +86,12 @@ class EngineCoreClient(ABC):
         executor_class: type[Executor],
         log_stats: bool,
         client_addresses: Optional[dict[str, str]] = None,
+        client_count: int = 1,
         client_index: int = 0,
     ) -> "MPClient":
         parallel_config = vllm_config.parallel_config
         client_args = (vllm_config, executor_class, log_stats,
-                       client_addresses, client_index)
+                       client_addresses, client_count, client_index)
         if parallel_config.data_parallel_size > 1:
             if parallel_config.data_parallel_external_lb:
                 # External load balancer - client per DP rank.
@@ -727,6 +728,7 @@ class AsyncMPClient(MPClient):
                  executor_class: type[Executor],
                  log_stats: bool,
                  client_addresses: Optional[dict[str, str]] = None,
+                 client_count: int = 1,
                  client_index: int = 0):
         super().__init__(
             asyncio_mode=True,
@@ -929,11 +931,12 @@ class DPAsyncMPClient(AsyncMPClient):
                  executor_class: type[Executor],
                  log_stats: bool,
                  client_addresses: Optional[dict[str, str]] = None,
+                 client_count: int = 1,
                  client_index: int = 0):
         self.current_wave = 0
 
         super().__init__(vllm_config, executor_class, log_stats,
-                         client_addresses, client_index)
+                         client_addresses, client_count, client_index)
 
         # List of [waiting, running] pair per engine.
         # Used only by DPLBAsyncMPClient subclass.
@@ -1029,7 +1032,11 @@ class DPAsyncMPClient(AsyncMPClient):
                     counts, wave, running = msgspec.msgpack.decode(buf)
                     self.current_wave = wave
                     self.engines_running = running
-                    self.lb_engines = counts[count_slice]
+                    if counts is not None:
+                        sliced_counts = counts[count_slice]
+                        self.lb_engines = sliced_counts
+                        logger.debug("Received counts: %s (%s)", sliced_counts,
+                                     count_slice)
 
         resources.stats_update_task = asyncio.create_task(
             run_engine_stats_update_task())
@@ -1065,40 +1072,45 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
                  executor_class: type[Executor],
                  log_stats: bool,
                  client_addresses: Optional[dict[str, str]] = None,
+                 client_count: int = 1,
                  client_index: int = 0):
 
+        self.client_count = client_count
+
         # To route aborts to the correct engine.
         self.reqs_in_flight: dict[str, EngineIdentity] = {}
 
         super().__init__(vllm_config, executor_class, log_stats,
-                         client_addresses, client_index)
+                         client_addresses, client_count, client_index)
 
         assert len(self.core_engines) > 1
 
+        self.eng_start_index = (len(self.core_engines) *
+                                self.client_index) // client_count
+
     def get_core_engine_for_request(
             self, request: EngineCoreRequest) -> EngineIdentity:
         # Engines are in rank order.
+        current_counts = self.lb_engines
         if (eng_index := request.data_parallel_rank) is None:
-            if not self.lb_engines:
+            if not current_counts:
                 return self.core_engine
             # TODO use P2C alg for larger DP sizes
-            num_engines = len(self.lb_engines)
-            min_counts = [sys.maxsize, sys.maxsize]
+            num_engines = len(current_counts)
+            min_score = sys.maxsize
             eng_index = 0
             for i in range(num_engines):
                 # Start from client_index to help with balancing when engines
                 # are empty.
-                idx = (self.client_index + i) % num_engines
-                counts = self.lb_engines[idx]
-                if counts < min_counts:
-                    min_counts = counts
+                idx = (self.eng_start_index + i) % num_engines
+                waiting, running = current_counts[idx]
+                score = waiting * 4 + running
+                if score < min_score:
+                    min_score = score
                     eng_index = idx
-            # Adjust local counts for better balancing between stats updates
-            # from the coordinator (which happen every 100ms).
-            if min_counts[0]:
-                min_counts[0] += 1
-            else:
-                min_counts[1] += 1
+            # Increment local waiting count for better balancing between stats
+            # updates from the coordinator (which happen every 100ms).
+            current_counts[eng_index][0] += self.client_count
 
         chosen_engine = self.core_engines[eng_index]
         # Record which engine is chosen for this request, to handle aborts.
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 1eb10ccb6c493..9a80460261e02 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -33,6 +33,10 @@ class SchedulerStats:
     num_running_reqs: int = 0
     num_waiting_reqs: int = 0
 
+    # These are used for internal DP load-balancing.
+    step_counter: int = 0
+    current_wave: int = 0
+
     kv_cache_usage: float = 0.0
 
     prefix_cache_stats: PrefixCacheStats = field(
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index c74d8c543f76c..d0175695c1d0f 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -154,6 +154,7 @@ class APIServerProcessManager:
             client_config = {
                 "input_address": in_addr,
                 "output_address": out_addr,
+                "client_count": num_servers,
                 "client_index": i
             }
             if stats_update_address is not None:

From 6e8d8c4afbddf725b34ef938616701869f5b3462 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Fri, 1 Aug 2025 22:45:46 -0400
Subject: [PATCH 54/54] [Test] Add Unit Test for Batched DeepGEMM (#21559)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 tests/kernels/moe/test_batched_deepgemm.py | 103 +++++++++++++++++++++
 tests/kernels/moe/test_deepgemm.py         |   8 +-
 vllm/utils/deep_gemm.py                    |   4 +-
 3 files changed, 107 insertions(+), 8 deletions(-)
 create mode 100644 tests/kernels/moe/test_batched_deepgemm.py

diff --git a/tests/kernels/moe/test_batched_deepgemm.py b/tests/kernels/moe/test_batched_deepgemm.py
new file mode 100644
index 0000000000000..018d4c224f75e
--- /dev/null
+++ b/tests/kernels/moe/test_batched_deepgemm.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+    BatchedDeepGemmExperts)
+from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+    BatchedPrepareAndFinalize, BatchedTritonExperts)
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel)
+from vllm.utils.deep_gemm import calc_diff, is_deep_gemm_supported
+
+from .test_deepgemm import make_block_quant_fp8_weights
+
+BLOCK_SIZE = [128, 128]
+
+
+@pytest.mark.skipif(not is_deep_gemm_supported(),
+                    reason="Requires deep_gemm kernels")
+@pytest.mark.parametrize("E", [16, 32])  # number of experts
+@pytest.mark.parametrize("T", [256, 512])  # tokens per expert
+@pytest.mark.parametrize("K", [128, 256])  # hidden dim
+@pytest.mark.parametrize("N", [512, 1024])  # intermediate dim per expert
+@pytest.mark.parametrize("topk", [2, 4])
+def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int,
+                                    monkeypatch):
+    """Compare BatchedDeepGemmExperts to BatchedTritonExperts."""
+
+    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
+
+    device = "cuda"
+    w1, w2, w1_s, w2_s = make_block_quant_fp8_weights(E, N, K, BLOCK_SIZE)
+
+    M = E * T  # total tokens
+    a = torch.randn(M, K, device=device, dtype=torch.bfloat16) / 10.0
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    a.clamp_(fp8_info.min, fp8_info.max)
+
+    # random router outputs → top-k indices / weights
+    router_logits = torch.randn(M, E, device=device, dtype=torch.float32)
+    topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
+    topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)
+
+    # token number for each expert
+    cnt = torch.bincount(topk_ids.flatten(), minlength=E)
+    max_cnt = int(cnt.max().item())
+    # next power of 2 for max token number
+    max_num_tokens = 1 << (max_cnt - 1).bit_length()
+
+    prep_finalize = BatchedPrepareAndFinalize(
+        max_num_tokens=max_num_tokens,
+        num_local_experts=E,
+        num_dispatchers=1,
+        rank=0,
+    )
+
+    # triton (reference)
+    triton_experts = BatchedTritonExperts(
+        max_num_tokens=max_num_tokens,
+        num_dispatchers=1,
+        use_fp8_w8a8=True,
+        per_act_token_quant=False,
+        block_shape=BLOCK_SIZE,
+    )
+    mk_triton = FusedMoEModularKernel(prep_finalize, triton_experts)
+
+    out_triton = mk_triton(
+        hidden_states=a,
+        w1=w1,
+        w2=w2,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        inplace=False,
+        w1_scale=w1_s,
+        w2_scale=w2_s,
+        global_num_experts=E,
+    )
+
+    # deepgemm
+    deepgemm_experts = BatchedDeepGemmExperts(
+        max_num_tokens=max_num_tokens,
+        num_dispatchers=1,
+        block_shape=BLOCK_SIZE,
+        per_act_token_quant=False,
+    )
+    mk_deepgemm = FusedMoEModularKernel(prep_finalize, deepgemm_experts)
+
+    out_deepgemm = mk_deepgemm(
+        hidden_states=a,
+        w1=w1,
+        w2=w2,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        inplace=False,
+        w1_scale=w1_s,
+        w2_scale=w2_s,
+        global_num_experts=E,
+    )
+
+    diff = calc_diff(out_deepgemm, out_triton)
+    assert diff < 1e-3, f"Output diff too large: {diff}"
diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py
index b6ea4ee2324c9..b2b78662c9ded 100644
--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@@ -20,11 +20,6 @@ from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported,
 
 BLOCK_SIZE = [128, 128]
 
-requires_deep_gemm = pytest.mark.skipif(
-    not is_deep_gemm_supported(),
-    reason="Requires deep_gemm kernels",
-)
-
 
 def make_block_quant_fp8_weights(
     e: int,
@@ -152,7 +147,8 @@ NUM_EXPERTS = [32]
 @pytest.mark.parametrize("mnk", MNKs)
 @pytest.mark.parametrize("topk", TOPKS)
 @pytest.mark.parametrize("num_experts", NUM_EXPERTS)
-@requires_deep_gemm
+@pytest.mark.skipif(not is_deep_gemm_supported(),
+                    reason="Requires deep_gemm kernels")
 def test_deepgemm_vs_triton(mnk, topk, num_experts, monkeypatch):
 
     with monkeypatch.context() as m:
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 8ab34e7505ee2..0edfb01cde9d6 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -23,10 +23,10 @@ def is_deep_gemm_supported() -> bool:
     """Return ``True`` if DeepGEMM is supported on the current platform.
     Currently, only Hopper and Blackwell GPUs are supported.
     """
-    supported_arch = current_platform.is_cuda() and (
+    is_supported_arch = current_platform.is_cuda() and (
         current_platform.is_device_capability(90)
         or current_platform.is_device_capability(100))
-    return has_deep_gemm() and supported_arch
+    return has_deep_gemm() and is_supported_arch
 
 
 @functools.cache