From b1a63d1b3be996babec6411e3abe559796f76ca9 Mon Sep 17 00:00:00 2001 From: nvjullin Date: Sat, 20 Sep 2025 04:36:34 +0800 Subject: [PATCH 01/24] [BugFix] Make FlashInferMetadataBuilder non-blocking (#25040) Signed-off-by: Julien Lin Co-authored-by: Michael Goin --- vllm/v1/attention/backends/flashinfer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index dda6dd4fbea7a..cb092aa74e7f1 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -585,9 +585,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): kv_data_type=self.kv_cache_dtype, ) else: - attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device) + attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to( + self.device, non_blocking=True) attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to( - self.device) + self.device, non_blocking=True) if num_decodes > 0: pure_decode = num_prefills == 0 From ddc9048394ae6294d0db7fd67270efea59c3a065 Mon Sep 17 00:00:00 2001 From: David-Wen <18927700430@163.com> Date: Sat, 20 Sep 2025 04:44:24 +0800 Subject: [PATCH 02/24] Fix: Correct FusedMoE layer reference in auto_round quantization (#24818) Signed-off-by: David-Wen <18927700430@163.com> Signed-off-by: Michael Goin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/quantization/auto_round.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index 1ca92273430dd..bf5141fa48943 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -241,7 +241,7 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: - return AWQMoEMethod(quant_args_marlin, layer.moe) + return AWQMoEMethod(quant_args_marlin, layer.moe_config) from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) @@ -327,7 +327,7 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: - return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe) + return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe_config) else: from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) From e69e0b8b5fc5ef2958f7b1fc159119e9c4c0e2d2 Mon Sep 17 00:00:00 2001 From: Alec S <10566873+alecsolder@users.noreply.github.com> Date: Fri, 19 Sep 2025 17:40:16 -0400 Subject: [PATCH 03/24] [Frontend] Responses API messages out, just harmony for now (#24985) Signed-off-by: Alec Solder Co-authored-by: Alec Solder Co-authored-by: Ye (Charlotte) Qi --- .../openai/test_response_api_with_harmony.py | 15 +++++++++++++++ vllm/entrypoints/openai/protocol.py | 17 ++++++++++++++++- vllm/entrypoints/openai/serving_responses.py | 13 +++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 40a22c04b08a5..f3c3148577b85 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -744,3 +744,18 @@ async def 
test_function_calling_full_history(client: OpenAI, model_name: str): assert response_2 is not None assert response_2.status == "completed" assert response_2.output_text is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_output_messages_enabled(client: OpenAI, model_name: str, + server): + response = await client.responses.create( + model=model_name, + input="What is the capital of South Korea?", + extra_body={"enable_response_messages": True}) + + assert response is not None + assert response.status == "completed" + assert len(response.input_messages) > 0 + assert len(response.output_messages) > 0 diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 05d5d6d964dd3..c30681318f693 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -328,6 +328,13 @@ class ResponsesRequest(OpenAIBaseModel): "access by 3rd parties, and long enough to be " "unpredictable (e.g., 43 characters base64-encoded, corresponding " "to 256 bit). Not supported by vLLM engine V0.")) + + enable_response_messages: bool = Field( + default=False, + description=( + "Dictates whether or not to return messages as part of the " + "response object. Currently only supported for non-streaming " + "non-background and gpt-oss only. ")) # --8<-- [end:responses-extra-params] _DEFAULT_SAMPLING_PARAMS = { @@ -1831,6 +1838,11 @@ class ResponsesResponse(OpenAIBaseModel): model: str object: Literal["response"] = "response" output: list[ResponseOutputItem] + # These are populated when enable_response_messages is set to True + # TODO: Currently an issue where content of harmony messages + # is not available when these are serialized. Metadata is available + input_messages: Optional[list[ChatCompletionMessageParam]] = None + output_messages: Optional[list[ChatCompletionMessageParam]] = None parallel_tool_calls: bool temperature: float tool_choice: ToolChoice @@ -1860,6 +1872,8 @@ class ResponsesResponse(OpenAIBaseModel): output: list[ResponseOutputItem], status: ResponseStatus, usage: Optional[ResponseUsage] = None, + input_messages: Optional[list[ChatCompletionMessageParam]] = None, + output_messages: Optional[list[ChatCompletionMessageParam]] = None, ) -> "ResponsesResponse": incomplete_details: Optional[IncompleteDetails] = None @@ -1868,7 +1882,6 @@ class ResponsesResponse(OpenAIBaseModel): # TODO: implement the other reason for incomplete_details, # which is content_filter # incomplete_details = IncompleteDetails(reason='content_filter') - return cls( id=request.request_id, created_at=created_time, @@ -1877,6 +1890,8 @@ class ResponsesResponse(OpenAIBaseModel): metadata=request.metadata, model=model_name, output=output, + input_messages=input_messages, + output_messages=output_messages, parallel_tool_calls=request.parallel_tool_calls, temperature=sampling_params.temperature, tool_choice=request.tool_choice, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 4894623aeac28..6e243671af242 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -475,9 +475,14 @@ class OpenAIServingResponses(OpenAIServing): # "completed" is implemented as the "catch-all" for now. 
status: ResponseStatus = "completed" + input_messages = None + output_messages = None if self.use_harmony: assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) + if request.enable_response_messages: + input_messages = context.messages[:context.num_init_messages] + output_messages = context.messages[context.num_init_messages:] num_tool_output_tokens = context.num_tool_output_tokens if len(output) > 0: if context.finish_reason == "length": @@ -496,6 +501,12 @@ class OpenAIServingResponses(OpenAIServing): output = self._make_response_output_items(request, final_output, tokenizer) + # TODO: context for non-gptoss models doesn't use messages + # so we can't get them out yet + if request.enable_response_messages: + raise NotImplementedError( + "enable_response_messages is currently" + " only supported for gpt-oss") # Calculate usage. assert final_res.prompt_token_ids is not None num_tool_output_tokens = 0 @@ -519,6 +530,8 @@ class OpenAIServingResponses(OpenAIServing): response = ResponsesResponse.from_request( request, sampling_params, + input_messages=input_messages, + output_messages=output_messages, model_name=model_name, created_time=created_time, output=output, From 711e912946d23f4ccc1f554b1524c960553c5e28 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 19 Sep 2025 18:23:19 -0400 Subject: [PATCH 04/24] [Compile] Fix Compile Warning for Ignoring `MIN_BLOCK_PER_SM` (#25193) Signed-off-by: yewentao256 --- csrc/launch_bounds_utils.h | 38 +++++++++++++++++++ .../activation_nvfp4_quant_fusion_kernels.cu | 6 ++- csrc/quantization/fp4/nvfp4_experts_quant.cu | 10 +++-- csrc/quantization/fp4/nvfp4_quant_kernels.cu | 8 ++-- 4 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 csrc/launch_bounds_utils.h diff --git a/csrc/launch_bounds_utils.h b/csrc/launch_bounds_utils.h new file mode 100644 index 0000000000000..d5a89690111bc --- /dev/null +++ b/csrc/launch_bounds_utils.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include + +// maximum blocks per SM cap +#ifndef VLLM_LAUNCH_BLOCKS_CAP + #define VLLM_LAUNCH_BLOCKS_CAP 4 +#endif + +// compile-time estimate of max threads per SM for launch bounds. +#ifndef VLLM_MAX_THREADS_PER_SM + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + #define VLLM_MAX_THREADS_PER_SM 1536 + #else + #define VLLM_MAX_THREADS_PER_SM 2048 + #endif +#endif + +// compute the number of blocks per SM to request in __launch_bounds__ +#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL)) +#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \ + (((VAL) <= 0) \ + ? 1 \ + : (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP)) +#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \ + VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS)) + +// runtime-time helper to compute blocks/SM +static inline int vllm_runtime_blocks_per_sm(int block_threads) { + int device = -1; + cudaGetDevice(&device); + int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM; + cudaDeviceGetAttribute(&max_threads_per_sm, + cudaDevAttrMaxThreadsPerMultiProcessor, device); + int blocks = (block_threads > 0) ? 
(max_threads_per_sm / block_threads) : 1; + return VLLM_CLAMP_BLOCKS_PER_SM(blocks); +} diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index 74fde23782ce5..7539f836ecf37 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -26,6 +26,7 @@ #include "dispatch_utils.h" #include "cuda_utils.h" +#include "launch_bounds_utils.h" #include "nvfp4_utils.cuh" namespace vllm { @@ -63,7 +64,7 @@ __inline__ __device__ PackedVec compute_silu_mul(PackedVec& vec, // Use UE4M3 by default. template -__global__ void __launch_bounds__(1024, 4) +__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) { @@ -131,7 +132,8 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d] const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024)); - int const numBlocksPerSM = 2048 / block.x; + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); VLLM_DISPATCH_HALF_TYPES( diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index ce3ba2c19b9eb..6d385e0dd94e7 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -26,12 +26,13 @@ #include "dispatch_utils.h" #include "nvfp4_utils.cuh" +#include "launch_bounds_utils.h" namespace vllm { // Use UE4M3 by default. template -__global__ void __launch_bounds__(512, 4) +__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, @@ -129,7 +130,7 @@ __global__ void __launch_bounds__(512, 4) // Kernel for LARGE_M_TOPK = true (large m_topk optimized version) template -__global__ void __launch_bounds__(1024, 4) +__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, @@ -233,8 +234,9 @@ void quant_impl(void* output, void* output_scale, void* input, int const workSizePerRow = k / ELTS_PER_THREAD; int const totalWorkSize = m_topk * workSizePerRow; dim3 block(std::min(workSizePerRow, 512)); - // Get number of blocks per SM (assume we can fully utilize the SM). - int const numBlocksPerSM = 2048 / block.x; + // Get number of blocks per SM + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); dim3 grid(std::min(static_cast((totalWorkSize + block.x - 1) / block.x), multiProcessorCount * numBlocksPerSM)); while (grid.x <= multiProcessorCount && block.x > 64) { diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 0c1b9ef0664d7..5575ee8e4197e 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -26,13 +26,14 @@ #include "dispatch_utils.h" #include "cuda_utils.h" +#include "launch_bounds_utils.h" #include "nvfp4_utils.cuh" namespace vllm { // Use UE4M3 by default. 
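// Worked example (illustrative note, not part of the patch): with the
// defaults defined in launch_bounds_utils.h above, VLLM_MAX_THREADS_PER_SM
// is 2048 and VLLM_LAUNCH_BLOCKS_CAP is 4, so
//   VLLM_BLOCKS_PER_SM(1024) -> clamp(2048 / 1024) = 2
//   VLLM_BLOCKS_PER_SM(512)  -> clamp(2048 / 512)  = 4  (hits the cap)
// The old hard-coded __launch_bounds__(1024, 4) asked for 4 * 1024 = 4096
// resident threads, more than the 2048-thread budget, which is the kind of
// over-request that makes the compiler ignore MIN_BLOCK_PER_SM and emit the
// warning this patch fixes; the clamped values used below stay within budget.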
template -__global__ void __launch_bounds__(512, 4) +__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) { using PackedVec = PackedVec; @@ -75,8 +76,9 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, // Grid, Block size. // Each thread converts 8 values. dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); - // Get number of blocks per SM (assume we can fully utilize the SM). - int const numBlocksPerSM = 2048 / block.x; + // Get number of blocks per SM + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); // Launch the cvt kernel. From 431535b522c62f1422848e22dfd83bec2d04111a Mon Sep 17 00:00:00 2001 From: Zhiyu Date: Fri, 19 Sep 2025 15:40:33 -0700 Subject: [PATCH 05/24] Enable modelopt gemma3 nvfp4/fp8, make workflow more robust (#22771) Signed-off-by: Zhiyu Cheng Signed-off-by: Michael Goin Co-authored-by: Michael Goin --- .../moe/test_modular_kernel_combinations.py | 3 +- vllm/compilation/backends.py | 7 ++- vllm/config/model.py | 3 ++ .../fused_moe/gpt_oss_triton_kernels_moe.py | 4 +- .../layers/quantization/modelopt.py | 53 +++++++++++++------ vllm/model_executor/models/gemma3.py | 16 ++++++ vllm/model_executor/models/siglip.py | 18 ++++++- 7 files changed, 82 insertions(+), 22 deletions(-) diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 19c4301bd23d5..1c7e62d7aa4c8 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -11,7 +11,8 @@ import pytest import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.config import VllmConfig, current_platform, set_current_vllm_config +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 3cc0fc3106f5a..d6bdb31a3c630 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -31,8 +31,11 @@ logger = init_logger(__name__) def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: if compilation_config.use_inductor: - if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer( - "2.8.0.dev"): + # Use standalone compile only if requested, version is new enough, + # and the symbol actually exists in this PyTorch build. 
+ if (envs.VLLM_USE_STANDALONE_COMPILE + and is_torch_equal_or_newer("2.8.0.dev") + and hasattr(torch._inductor, "standalone_compile")): logger.debug("Using InductorStandaloneAdaptor") return InductorStandaloneAdaptor() else: diff --git a/vllm/config/model.py b/vllm/config/model.py index 21457d3660a23..4e847922b61e6 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -964,6 +964,9 @@ class ModelConfig: "modelopt", "modelopt_fp4", "petit_nvfp4", + # Ensure heavy backends are probed last to avoid unnecessary + # imports during override detection (e.g., MXFP4 imports Triton) + "mxfp4", ] quantization_methods = [ q for q in supported_quantization if q not in overrides diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 08a9b34a42457..f12d3807517ff 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -20,10 +20,10 @@ if has_triton_kernels(): from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, matmul_ogs) from triton_kernels.routing import routing - except ModuleNotFoundError: + except (ModuleNotFoundError, AttributeError) as e: logger.error( "Failed to import Triton kernels. Please make sure your triton " - "version is compatible.") + "version is compatible. Error: %s", e) def triton_kernel_moe_forward( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7eac40825ac33..1083f398a3a20 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -160,6 +160,7 @@ class ModelOptFp8Config(QuantizationConfig): def is_layer_excluded(self, prefix: str) -> bool: """ Check if a layer should be excluded from quantization. + Handles both exact matching (for fused layers) and substring matching. This method handles both regular models and multimodal models that use the language_model prefix. 
For multimodal models, it checks if the @@ -168,11 +169,18 @@ class ModelOptFp8Config(QuantizationConfig): if self.exclude_modules is None: return False - # Check if any excluded module matches the prefix + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, + self.packed_modules_mapping): + return True + + # Then check substring matching for patterns not caught by exact match for module in self.exclude_modules: - if (module in prefix - or (prefix.startswith("language_model.") - and module in prefix.removeprefix("language_model."))): + # Skip exact matches already handled above + if (module != prefix and + (module in prefix or + (prefix.startswith("language_model.") + and module in prefix.removeprefix("language_model.")))): return True return False @@ -180,9 +188,10 @@ class ModelOptFp8Config(QuantizationConfig): prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): - if (is_layer_skipped(prefix, self.exclude_modules, - self.packed_modules_mapping) - or self.is_layer_excluded(prefix)): + if self.is_layer_excluded(prefix): + return UnquantizedLinearMethod() + # Check if this is a vision model layer that should not be quantized + if ("vision_tower" in prefix or "vision_model" in prefix): return UnquantizedLinearMethod() return ModelOptFp8LinearMethod(self) elif isinstance(layer, Attention): @@ -778,22 +787,34 @@ class ModelOptNvFp4Config(QuantizationConfig): return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo, exclude_modules, group_size) - def is_layer_excluded(self, prefix: str, - exclude_modules: list[str]) -> bool: + def is_layer_excluded(self, prefix: str) -> bool: + """ + Check if a layer should be excluded from quantization. + Handles both exact matching (for fused layers) and pattern matching. + """ + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, + self.packed_modules_mapping): + return True + + # Check regex pattern matching for patterns not caught by exact match import regex as re - for pattern in exclude_modules: - regex_str = pattern.replace('.', r'\.').replace('*', r'.*') - if re.fullmatch(regex_str, prefix): - return True + for pattern in self.exclude_modules: + # Skip patterns that would be caught by exact matching + if '*' in pattern or '.' 
in pattern: + regex_str = pattern.replace('.', r'\.').replace('*', r'.*') + if re.fullmatch(regex_str, prefix): + return True return False def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): - if (is_layer_skipped(prefix, self.exclude_modules, - self.packed_modules_mapping) - or self.is_layer_excluded(prefix, self.exclude_modules)): + if self.is_layer_excluded(prefix): + return UnquantizedLinearMethod() + # Check if this is a vision model layer that should not be quantized + if ("vision_tower" in prefix or "vision_model" in prefix): return UnquantizedLinearMethod() return ModelOptNvFp4LinearMethod(self) elif isinstance(layer, Attention): diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 1263e3049a14a..7246308d59028 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -446,6 +446,22 @@ class Gemma3Model(nn.Module): weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue + + # Check if this is a scale parameter that needs remapping first + if name.endswith( + (".k_scale", ".v_scale", ".q_scale", ".prob_scale")): + # Try to remap the scale name first + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is not None and remapped_name in params_dict: + # Successfully remapped, use the remapped name + param = params_dict[remapped_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(remapped_name) + continue + # If remapping failed, continue with normal processing + for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 3630f59f53e0a..eb49d6d2c3350 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -20,7 +20,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs @@ -506,6 +507,21 @@ class SiglipVisionModel(nn.Module): if layer_idx >= layer_count: continue + # Check if this is a scale parameter that needs remapping first + if name.endswith( + (".k_scale", ".v_scale", ".q_scale", ".prob_scale")): + # Try to remap the scale name first + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is not None and remapped_name in params_dict: + # Successfully remapped, use the remapped name + param = params_dict[remapped_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(remapped_name) + continue + # If remapping failed, continue with normal processing + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue From ee7a66dd9a5ead46f062502af33766f45076f05d Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Fri, 19 Sep 2025 
15:59:41 -0700 Subject: [PATCH 06/24] allow disable flashinfer prefill (#25276) Signed-off-by: Lu Fang --- vllm/envs.py | 3 +++ vllm/v1/attention/backends/mla/common.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 19e2f8635275d..294a0b920fb78 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" VLLM_NO_USAGE_STATS: bool = False + VLLM_DISABLE_FLASHINFER_PREFILL: bool = False VLLM_DO_NOT_TRACK: bool = False VLLM_USAGE_SOURCE: str = "" VLLM_CONFIGURE_LOGGING: int = 1 @@ -479,6 +480,8 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"), "VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", + "VLLM_DISABLE_FLASHINFER_PREFILL": + lambda: os.environ.get("VLLM_DISABLE_FLASHINFER_PREFILL", "0") == "1", "VLLM_DO_NOT_TRACK": lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get( "DO_NOT_TRACK", None) or "0") == "1", diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index a990cb2f1a972..5b307810de930 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -412,7 +412,8 @@ M = TypeVar("M", bound=MLACommonMetadata) def use_flashinfer_prefill() -> bool: # For blackwell default to flashinfer prefill if it's available since # it is faster than FA2. - return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL + return (not envs.VLLM_DISABLE_FLASHINFER_PREFILL and flashinfer_available + and not envs.VLLM_USE_CUDNN_PREFILL and current_platform.is_device_capability(100)) From 14c1432789c9c1b66308481b2c37439d3ee6661a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 16:34:07 -0700 Subject: [PATCH 07/24] [BugFix] Fix async scheduling CPU tensor race take 2 (#25279) Signed-off-by: Nick Hill --- vllm/v1/worker/gpu_model_runner.py | 52 ++++++++++++++++++------------ 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d0f26266f0c5..3539f75612050 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1903,7 +1903,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } - elif (self.enable_prompt_embeds and get_pp_group().is_first_rank): + elif self.enable_prompt_embeds and get_pp_group().is_first_rank: # Get the input embeddings for the tokens that are not input embeds, # then put them into the appropriate positions. # TODO(qthequartermasterman): Since even when prompt embeds are @@ -2125,6 +2125,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): invalid_req_indices, ) + @contextmanager + def synchronize_input_prep(self): + if self.prepare_inputs_event is None: + yield + return + + # Ensure prior step has finished with reused CPU tensors. + # This is required in the async scheduling case because + # the CPU->GPU transfer happens async. 
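# Illustrative timeline (comment only, not part of the patch): step N
# prepares inputs into these reused CPU tensors, launches the async
# (non_blocking) H2D copies, then record()s the event in the `finally`
# below; step N+1 must reach this synchronize() before _update_states() /
# _prepare_inputs() overwrite the same CPU tensors, otherwise a
# still-in-flight copy from the prior step could read clobbered data.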
+ self.prepare_inputs_event.synchronize() + try: + yield + finally: + self.prepare_inputs_event.record() + @torch.inference_mode() def execute_model( self, @@ -2132,33 +2147,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]: with record_function_or_nullcontext("Preprocess"): - self._update_states(scheduler_output) - if not scheduler_output.total_num_scheduled_tokens: - if not has_kv_transfer_group(): - # Return empty ModelRunnerOutput if there's no work to do. - return EMPTY_MODEL_RUNNER_OUTPUT - return self.kv_connector_no_forward(scheduler_output, - self.vllm_config) - if self.cache_config.kv_sharing_fast_prefill: - assert not self.input_batch.num_prompt_logprobs, ( - "--kv-sharing-fast-prefill produces incorrect logprobs for " - "prompt tokens, tokens, please disable it when the requests" - " need prompt logprobs") + with self.synchronize_input_prep(): + # Update persistent batch states. + self._update_states(scheduler_output) + + if not scheduler_output.total_num_scheduled_tokens: + if not has_kv_transfer_group(): + # Return empty ModelRunnerOutput if no work to do. + return EMPTY_MODEL_RUNNER_OUTPUT + return self.kv_connector_no_forward( + scheduler_output, self.vllm_config) + if self.cache_config.kv_sharing_fast_prefill: + assert not self.input_batch.num_prompt_logprobs, ( + "--kv-sharing-fast-prefill produces incorrect " + "logprobs for prompt tokens, tokens, please disable " + "it when the requests need prompt logprobs") - if self.prepare_inputs_event is not None: - # Ensure prior step has finished with reused CPU tensors. - self.prepare_inputs_event.synchronize() - try: # Prepare the decoder inputs. 
(attn_metadata, logits_indices, spec_decode_metadata, num_scheduled_tokens_np, spec_decode_common_attn_metadata, max_query_len, ubatch_slices, num_tokens_after_padding ) = self._prepare_inputs(scheduler_output) - finally: - if self.prepare_inputs_event is not None: - self.prepare_inputs_event.record() - ( num_scheduled_tokens, num_input_tokens, From 3da17c2cc2c2e1d750020e033535f942f156f64c Mon Sep 17 00:00:00 2001 From: Lucas Kabela Date: Fri, 19 Sep 2025 17:27:21 -0700 Subject: [PATCH 08/24] [Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE #2969 (#25090) Signed-off-by: Lucas Kabela --- tests/compile/test_basic_correctness.py | 16 ++-------------- tests/compile/test_full_graph.py | 4 +--- vllm/compilation/wrapper.py | 10 ++++------ vllm/envs.py | 5 ----- vllm/v1/worker/gpu_model_runner.py | 4 +--- vllm/worker/model_runner.py | 8 +++----- 6 files changed, 11 insertions(+), 36 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index fd2b1866e62e1..a1e5127ebeeb2 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -20,7 +20,6 @@ class TestSetting: tp_size: int attn_backend: str method: str - fullgraph: bool # we cannot afford testing the full Cartesian product @@ -36,7 +35,6 @@ class TestSetting: tp_size=2, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # llama model with quantization TestSetting( @@ -46,7 +44,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # MoE model TestSetting( @@ -56,7 +53,6 @@ class TestSetting: tp_size=2, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # embedding model TestSetting( @@ -73,7 +69,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="encode", - fullgraph=True, ), TestSetting( model="BAAI/bge-base-en-v1.5", @@ -82,7 +77,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="encode", - fullgraph=True, ), # vision language model TestSetting( @@ -92,7 +86,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="generate_with_image", - fullgraph=False, ), ], ) @@ -109,9 +102,8 @@ def test_compile_correctness( tp_size = test_setting.tp_size attn_backend = test_setting.attn_backend method = test_setting.method - fullgraph = test_setting.fullgraph - if cuda_device_count_stateless() != pp_size * tp_size: - pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got " + if cuda_device_count_stateless() < pp_size * tp_size: + pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got " f"{cuda_device_count_stateless()}") with monkeypatch.context() as m: @@ -149,9 +141,5 @@ def test_compile_correctness( ]: all_args.append(final_args + [f"-O{level}"]) all_envs.append({}) - if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: - # "DYNAMO_ONCE" will always use fullgraph - all_envs[-1][ - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 84178344a5f36..3439a1b29038d 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -79,9 +79,7 @@ def test_full_graph( ): model, model_kwargs = model_info - with monkeypatch.context() as m: - # make sure these models can be captured in full graph mode - m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") + with monkeypatch.context(): print(f"MODEL={model}") 
run_model(optimization_level, model, model_kwargs) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 96d4eae2ee9aa..930e4d27b410f 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -10,7 +10,6 @@ from typing import Callable, Optional import torch -import vllm.envs as envs from vllm.config import (CompilationLevel, CUDAGraphMode, get_current_vllm_config) from vllm.logger import init_logger @@ -47,11 +46,10 @@ class TorchCompileWrapperWithCustomDispatcher: options = get_current_vllm_config( ).compilation_config.inductor_compile_config - compiled_callable = torch.compile( - self.forward, - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend, - options=options) + compiled_callable = torch.compile(self.forward, + fullgraph=True, + backend=backend, + options=options) self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ diff --git a/vllm/envs.py b/vllm/envs.py index 294a0b920fb78..3991a789d80f6 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -434,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)), - # Internal flag to enable Dynamo fullgraph capture - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": - lambda: bool( - os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), - # Feature flag to enable/disable Inductor standalone compile. # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is # enabled by default. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3539f75612050..dffadd1d769b7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2602,9 +2602,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): backend = self.vllm_config.compilation_config.init_backend( self.vllm_config) compilation_counter.dynamo_as_is_count += 1 - self.model.compile( - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) + self.model.compile(fullgraph=True, backend=backend) return # for other compilation levels, cudagraph behavior is controlled by # CudagraphWraper and CudagraphDispatcher of vllm. 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index c91c871766cff..f662f5a85eff6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -18,7 +18,6 @@ import torch.distributed import torch.nn as nn from tqdm.auto import tqdm -import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState @@ -1099,10 +1098,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): backend = self.vllm_config.compilation_config.init_backend( self.vllm_config) compilation_counter.dynamo_as_is_count += 1 - self.model = torch.compile( - self.model, - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) + self.model = torch.compile(self.model, + fullgraph=True, + backend=backend) def get_model(self) -> nn.Module: return self.model From a36c675817867235d368faf7e8d81e0ed3333d9c Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Fri, 19 Sep 2025 21:33:25 -0300 Subject: [PATCH 09/24] Don't skip special tokens with hermes-style tool calling (#25281) Signed-off-by: Max de Bayser --- .../openai/tool_parsers/hermes_tool_parser.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index a6ce33af6bd00..e74c420da1d3c 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -98,6 +98,15 @@ class Hermes2ProToolParser(ToolParser): else: return delta_text + def adjust_request( + self, request: ChatCompletionRequest) -> ChatCompletionRequest: + if request.tools and request.tool_choice != 'none': + # do not skip special tokens because the tool_call tokens are + # marked "special" in some models. Since they are skipped + # prior to the call to the tool parser, it breaks tool calling. 
+ request.skip_special_tokens = False + return request + def extract_tool_calls( self, model_output: str, From c7e713616a53a097809609d5a7b536e8bfad4ab8 Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Fri, 19 Sep 2025 19:33:40 -0500 Subject: [PATCH 10/24] test: Remove vestigial skip for prompt embeds tests after landing v1 Prompt Embeds support (#25291) Signed-off-by: Andrew Sansom --- tests/entrypoints/openai/test_completion_with_prompt_embeds.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 0e3fc82f0c033..176c1825530e4 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -14,9 +14,6 @@ from transformers import AutoConfig from ...utils import RemoteOpenAIServer -pytest.skip("Skipping prompt_embeds test until V1 supports it.", - allow_module_level=True) - # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" From b8a287a0a8a035073d67b5101687e3a753dd02ac Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Fri, 19 Sep 2025 19:46:23 -0500 Subject: [PATCH 11/24] [docs] Prompt Embedding feature support (#25288) Signed-off-by: Andrew Sansom --- docs/features/README.md | 34 ++++++++++++++++++---------------- docs/features/prompt_embeds.md | 3 --- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/docs/features/README.md b/docs/features/README.md index d8e26ec02aecc..10cc448cc2ee3 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -36,22 +36,23 @@ th:not(:first-child) { } -| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | -| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | -| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | -| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | -| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | -| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | -| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | -| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | -| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | -| async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | -| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | -| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)^ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | -| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | -| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | +| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | [prompt-embeds](prompt_embeds.md) | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | +| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | | +| 
[LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | +| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | +| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | +| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | | +| async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | | +| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | | +| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)^ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | | +| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | | +| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | | +| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ | \* Chunked prefill and prefix caching are only applicable to last-token pooling. ^ LoRA is only applicable to the language backbone of multimodal models. @@ -76,3 +77,4 @@ th:not(:first-child) { | multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ | | best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](gh-issue:25097) | diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md index 83993bd0140fa..f9d3c1fb6c23d 100644 --- a/docs/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -6,9 +6,6 @@ This page teaches you how to pass prompt embedding inputs to vLLM. The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary. -!!! note - Prompt embeddings are currently only supported in the v0 engine. 
- ## Offline Inference To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]: From 8945b001db3202f882108e50d16b6f9c5e6f01ed Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Fri, 19 Sep 2025 18:02:15 -0700 Subject: [PATCH 12/24] [torch.compile] CUDAGraph Inductor partition integration (#24281) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Boyuan Feng Signed-off-by: Boyuan Feng Signed-off-by: boyuanfeng Co-authored-by: Luka Govedič --- tests/compile/piecewise/test_simple.py | 71 ++++++++++++++++++---- tests/compile/silly_attention.py | 1 + tests/compile/test_full_graph.py | 59 +++++++++++++++++- tests/compile/test_fusion_attn.py | 16 ++++- vllm/attention/layer.py | 2 + vllm/compilation/backends.py | 10 ++- vllm/compilation/decorators.py | 57 ++++++++++++++++- vllm/config/compilation.py | 84 ++++++++++++++++++++++---- vllm/v1/cudagraph_dispatcher.py | 12 ++-- 9 files changed, 280 insertions(+), 32 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 84f4945c82725..41055f431569c 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -15,6 +15,7 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, VllmConfig, set_current_vllm_config) from vllm.envs import VLLM_USE_V1 from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from ..silly_attention import get_global_counter, reset_global_counter @@ -50,16 +51,21 @@ class SillyModel(nn.Module): return x -@pytest.mark.parametrize("use_inductor", [True, False]) -@torch.inference_mode() -def test_simple_piecewise_compile(use_inductor): - assert VLLM_USE_V1 - +def _run_simple_model( + splitting_ops, + use_inductor_graph_partition, + use_inductor, + expected_num_piecewise_graphs_seen, + expected_num_piecewise_capturable_graphs_seen, + expected_num_backend_compilations, + expected_num_cudagraph_captured, +): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, use_inductor=use_inductor, - splitting_ops=["silly.attention"], + splitting_ops=splitting_ops, + use_inductor_graph_partition=use_inductor_graph_partition, cudagraph_copy_inputs=True, cudagraph_capture_sizes=[1, 2], )) @@ -70,11 +76,11 @@ def test_simple_piecewise_compile(use_inductor): with compilation_counter.expect( num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=5, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=3, # 1 + num_layers - num_backend_compilations=3, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured= - 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen= + expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, + num_cudagraph_captured=expected_num_cudagraph_captured, ), set_forward_context(None, vllm_config=vllm_config): # background context # warm up with background context @@ -104,3 +110,46 @@ def test_simple_piecewise_compile(use_inductor): output = model(input) assert get_global_counter() == 2 assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0])) + + +@pytest.mark.parametrize("use_inductor", [True, False]) +@torch.inference_mode() +def 
test_simple_piecewise_compile(use_inductor): + assert VLLM_USE_V1 + _run_simple_model( + splitting_ops=["silly.attention"], + use_inductor_graph_partition=False, + use_inductor=use_inductor, + expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1 + expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers + expected_num_backend_compilations= + 3, # num_piecewise_capturable_graphs_seen + expected_num_cudagraph_captured= + 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ) + + +@torch.inference_mode() +@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []]) +def test_simple_inductor_graph_partition(splitting_ops): + assert VLLM_USE_V1 + if not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + + _run_simple_model( + # inductor graph partition automatically resets splitting_ops + # to be an empty list + splitting_ops=splitting_ops, + use_inductor_graph_partition=True, + use_inductor=True, + expected_num_piecewise_graphs_seen= + 1, # since not splitting at fx graph level + expected_num_piecewise_capturable_graphs_seen= + 1, # since not splitting at fx graph level + expected_num_backend_compilations= + 1, # since not splitting at fx graph level + expected_num_cudagraph_captured= + 6, # inductor graph partition still captures 6 + # graph, same as fx graph partition. + ) diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py index 13eb0bf4b1fa1..baedafbae99f1 100644 --- a/tests/compile/silly_attention.py +++ b/tests/compile/silly_attention.py @@ -60,4 +60,5 @@ direct_register_custom_op( mutates_args=["out"], fake_impl=silly_attention_fake, target_lib=silly_lib, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 3439a1b29038d..870aa553ca628 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -3,6 +3,7 @@ from __future__ import annotations +import logging import tempfile from typing import Any, Optional, Union @@ -10,9 +11,13 @@ import pytest import torch from tests.quantization.utils import is_quant_method_supported +from tests.v1.attention.utils import _Backend from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel, PassConfig +from vllm.attention.selector import global_force_attn_backend_context_manager +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + PassConfig) from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer from ..utils import create_new_process_for_each_test @@ -105,6 +110,18 @@ def test_full_graph( (CompilationConfig(level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir()), ("facebook/opt-125m", {})), + ] + [ + # graph inductor partition + ( + CompilationConfig( + level=CompilationLevel.PIECEWISE, + # inductor graph partition uses + # torch._C.Tag.cudagraph_unsafe to specify splitting ops + use_inductor_graph_partition=True, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + compile_sizes=[1, 2]), + model) for model in models_list(all=False) + if is_torch_equal_or_newer("2.9.0.dev") ]) # only test some of the models @create_new_process_for_each_test() @@ -112,11 +129,51 @@ def test_custom_compile_config( compilation_config: CompilationConfig, model_info: tuple[str, dict[str, Any]], ): + if (compilation_config.use_inductor_graph_partition + and not is_torch_equal_or_newer("2.9.0.dev")): + pytest.skip("inductor graph 
partition is only available " + "in PyTorch 2.9+") + model, model_kwargs = model_info print(f"MODEL={model}") run_model(compilation_config, model, model_kwargs) +def test_inductor_graph_partition_attn_fusion(caplog_vllm): + if not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + + model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_inductor_graph_partition=True, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + custom_ops=["+quant_fp8"], + pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True), + ) + model_kwargs = { + "kv_cache_dtype": "fp8", + "max_model_len": 1024, + } + with caplog_vllm.at_level( + logging.DEBUG), global_force_attn_backend_context_manager( + _Backend.FLASHINFER): + run_model(compilation_config, model, model_kwargs) + + try: + assert ("Fused quantization onto 48 attention nodes" + in caplog_vllm.text), caplog_vllm.text + except AssertionError: + # Note: this message is only triggered when the compilation goes + # through the custom pass. Due to multiple layers of cache on + # PyTorch side, the compilation of a graph may be cached such + # that custom pass directly goes through cache. In this case, + # we go through this branch and assert that the pass is not + # triggered. + assert "Fused quantization" not in caplog_vllm.text + + def run_model(compile_config: Union[int, CompilationConfig], model: str, model_kwargs: dict[str, Any]): prompts = [ diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 6baf4bf83f499..022f183b31932 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -27,6 +27,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp) from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer from vllm.v1.kv_cache_interface import AttentionSpec FP8_DTYPE = current_platform.fp8_dtype() @@ -339,6 +340,10 @@ else: @pytest.mark.parametrize( "split_attention", [False, True] if current_platform.is_rocm() else [False]) +# TODO(boyuan): test inductor graph partition on rocm +@pytest.mark.parametrize( + "use_inductor_graph_partition", + [False] if current_platform.is_rocm() else [False, True]) @pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA") @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8") @@ -352,9 +357,15 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, dtype: torch.dtype, model_name: str, model_class: type[AttentionQuantPatternModel], backend: _Backend, split_attention: bool, - monkeypatch, dist_init): + use_inductor_graph_partition: bool, + monkeypatch, dist_init, caplog_vllm): """Test AttentionStaticQuantPattern fusion pass""" + if use_inductor_graph_partition and not is_torch_equal_or_newer( + "2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + monkeypatch.setenv("VLLM_USE_V1", "1") if split_attention: monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1") @@ -372,6 +383,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, custom_ops=["+quant_fp8"], + use_inductor_graph_partition=use_inductor_graph_partition, ), cache_config=CacheConfig(cache_dtype="fp8")) @@ 
-444,6 +456,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, backend=test_backend, fullgraph=True) assert model_compiled.attn._o_scale_float is None + result_fused_1 = model_compiled(q, k, v) if backend == _Backend.FLASHINFER: @@ -453,6 +466,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, # _o_scale_float assert model_compiled.attn._o_scale_float is not None result_fused_2 = model_compiled(q, k, v) + assert model_compiled.attn._o_scale_float is not None torch.testing.assert_close(result_unfused, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 8d5ebd93e063d..3d1269c0ecea8 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -577,6 +577,7 @@ direct_register_custom_op( mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) @@ -627,4 +628,5 @@ direct_register_custom_op( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index d6bdb31a3c630..17fc727b8fc70 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -329,6 +329,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] global compilation_start_time + compiled_graph_for_dynamic_shape = self.vllm_backend.\ compiler_manager.compile( submod, @@ -339,7 +340,6 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): num_graphs=len(self.compile_submod_names), runtime_shape=None) # Lazy import here to avoid circular import - from .cuda_graph import CUDAGraphOptions from .cuda_piecewise_backend import PiecewiseBackend piecewise_backend = PiecewiseBackend( @@ -347,7 +347,13 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_dynamic_shape, self.vllm_backend) - if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and + not self.compilation_config.use_inductor_graph_partition): + # We're using Dynamo-based piecewise splitting, so we wrap + # the whole subgraph with a static graph wrapper. + from .cuda_graph import CUDAGraphOptions + # resolve the static graph wrapper class (e.g. CUDAGraphWrapper # class) as platform dependent. 
static_graph_wrapper_class = resolve_obj_by_qualname( diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 41d9fcb824b01..b7a6e23c1aa79 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib import inspect from typing import Callable, Optional, TypeVar, Union, overload from unittest.mock import patch @@ -14,7 +15,7 @@ from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import CompilationLevel, VllmConfig from vllm.logger import init_logger from vllm.sequence import IntermediateTensors -from vllm.utils import supports_dynamo +from vllm.utils import resolve_obj_by_qualname, supports_dynamo from .monitor import start_monitoring_torch_compile @@ -301,8 +302,11 @@ def _support_torch_compile( with patch.object(InliningInstructionTranslator, 'inline_call', patched_inline_call), torch._dynamo.config.patch( - **dynamo_config_patches): + **dynamo_config_patches + ), maybe_use_cudagraph_partition_wrapper( + self.vllm_config): output = self.compiled_callable(*args, **kwargs) + return output # usually, capturing the model once is enough, and then we can @@ -314,3 +318,52 @@ def _support_torch_compile( cls.__call__ = __call__ return cls + + +@contextlib.contextmanager +def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): + """ + Context manager to set/unset customized cudagraph partition wrappers. + + If we're using Inductor-based graph partitioning, we currently have the + whole `fx.Graph` before Inductor lowering and and the piecewise + splitting happens after all graph passes and fusions. Here, we add + a custom hook for Inductor to wrap each partition with our static + graph wrapper class to maintain more control over static graph + capture and replay. + """ + from vllm.config import CUDAGraphMode + + compilation_config = vllm_config.compilation_config + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + from torch._inductor.utils import CUDAGraphWrapperMetadata + + from vllm.compilation.cuda_graph import CUDAGraphOptions + from vllm.platforms import current_platform + + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + + def customized_cudagraph_wrapper(f, + metadata: CUDAGraphWrapperMetadata): + partition_id = metadata.partition_index + num_partitions = metadata.num_partitions + return static_graph_wrapper_class( + runnable=f, + vllm_config=vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=partition_id == 0, + gc_disable=partition_id != 0, + weak_ref_output=partition_id == num_partitions - 1, + )) + + torch._inductor.utils.set_customized_partition_wrappers( + customized_cudagraph_wrapper) + + yield + + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + torch._inductor.utils.set_customized_partition_wrappers(None) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 3618f472e742d..22b38daf46c39 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -299,6 +299,26 @@ class CompilationConfig: minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead. 
""" + use_inductor_graph_partition: bool = False + """Use inductor graph partition to split the graph at cudagraph_unsafe ops. + This partition happens at inductor codegen time after all passes and fusions + are finished. It generates a single `call` function which wraps + cudagraph-safe ops into partition functions and leave cudagraph-unsafe ops + outside the partition functions. For a graph with N cudagraph-unsafe ops + (e.g., Attention), there would be N+1 partitions. To mark an op as + cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when + register the custom op. + + This config supports both full cudagraph and piecewise cudagraph without + compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper + to each partition. For N+1 partitions, there would be N+1 + CUDAGraph wrapper instances. + + For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the + inductor `call` function in the model runner. The top-level full cudagraph + capture ignores all partitioning. + """ + pass_config: PassConfig = field(default_factory=PassConfig) """Custom inductor passes, see PassConfig for more details""" @@ -461,6 +481,12 @@ class CompilationConfig: "since full_cuda_graph is deprecated.") self.cudagraph_mode = CUDAGraphMode.FULL + if (self.use_inductor_graph_partition + and not is_torch_equal_or_newer("2.9.0.dev")): + raise ValueError("use_inductor_graph_partition is only " + "supported with torch>=2.9.0.dev. Set " + "use_inductor_graph_partition=False instead.") + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -540,19 +566,36 @@ class CompilationConfig: "set_splitting_ops_for_v1 should only be called when " "level is CompilationLevel.PIECEWISE") + use_inductor_graph_partition_msg = ( + "When use_inductor_graph_partition=True, splitting_ops " + "are ignored and set to an empty list. Instead, " + "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is " + "used to annotate custom ops for graph partition.") + if self.splitting_ops is None: - # NOTE: When using full cudagraph, instead of setting an empty - # list and capture the full cudagraph inside the flattened fx - # graph, we keep the piecewise fx graph structure but capture the - # full cudagraph outside the fx graph. This reduces some cpu - # overhead when the runtime batch_size is not cudagraph captured. - # see https://github.com/vllm-project/vllm/pull/20059 for details. - # make a copy to avoid mutating the class-level list via reference. - self.splitting_ops = list(self._attention_ops) + if self.use_inductor_graph_partition: + # When using inductor graph partition, we set splitting_ops + # to be empty and rely on torch._C.Tag.cudagraph_unsafe to + # annotate custom ops as splitting ops. + logger.warning_once(use_inductor_graph_partition_msg) + self.splitting_ops = [] + else: + # NOTE: When using full cudagraph, instead of setting an empty + # list and capture the full cudagraph inside the flattened fx + # graph, we keep the piecewise fx graph structure but capture + # the full cudagraph outside the fx graph. This reduces some + # cpu overhead when the runtime batch_size is not cudagraph + # captured. see https://github.com/vllm-project/vllm/pull/20059 + # for details. make a copy to avoid mutating the class-level + # list via reference. 
+ self.splitting_ops = list(self._attention_ops) elif len(self.splitting_ops) == 0: - logger.warning_once("Using piecewise compilation with empty " - "splitting_ops.") - if self.cudagraph_mode == CUDAGraphMode.PIECEWISE: + logger.warning_once( + "Using piecewise compilation with empty " + "splitting_ops and use_inductor_graph_partition" + f"={self.use_inductor_graph_partition}.") + if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE + and not self.use_inductor_graph_partition): logger.warning_once( "When compilation level is piecewise with empty " "splitting_ops, PIECEWISE cudagraph_mode will be " @@ -562,7 +605,26 @@ class CompilationConfig: "any problems.") self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] + elif self.use_inductor_graph_partition: + logger.warning_once(use_inductor_graph_partition_msg) + self.splitting_ops = [] def splitting_ops_contain_attention(self) -> bool: return self.splitting_ops is not None and all( op in self.splitting_ops for op in self._attention_ops) + + def is_attention_compiled_piecewise(self) -> bool: + use_fx_graph_piecewise_compilation = ( + self.level == CompilationLevel.PIECEWISE + and self.splitting_ops_contain_attention()) + + inductor_used = (self.level == CompilationLevel.PIECEWISE + and self.use_inductor) or ( + self.level >= CompilationLevel.DYNAMO_AS_IS + and self.backend == "inductor") + use_inductor_piecewise_compilation = ( + inductor_used and self.use_inductor_graph_partition + and not self.splitting_ops_contain_attention()) + + return use_fx_graph_piecewise_compilation or \ + use_inductor_piecewise_compilation diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index d2db7dcb3f091..ea4fba8eeea6d 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional -from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.forward_context import BatchDescriptor from vllm.logger import init_logger @@ -39,11 +39,15 @@ class CudagraphDispatcher: CUDAGraphMode.FULL: set(), } - assert not self.cudagraph_mode.requires_piecewise_compilation() or \ - (self.compilation_config.level == CompilationLevel.PIECEWISE and - self.compilation_config.splitting_ops_contain_attention()), \ + not_use_piecewise_compilation = ( + not self.cudagraph_mode.requires_piecewise_compilation()) + + assert not_use_piecewise_compilation or \ + self.compilation_config.is_attention_compiled_piecewise(), \ "Compilation level should be CompilationLevel.PIECEWISE when "\ "cudagraph_mode piecewise cudagraphs is used, "\ + "and attention should be in splitting_ops or "\ + "inductor splitting should be used. 
" \ f"cudagraph_mode={self.cudagraph_mode}, "\ f"compilation_level={self.compilation_config.level}, "\ f"splitting_ops={self.compilation_config.splitting_ops}" From a25ade5d473fc00107bd3950141d8211331d3377 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 18:06:34 -0700 Subject: [PATCH 13/24] [BugFix] Ensure appropriate guards in destructors (#25284) Signed-off-by: Nick Hill Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/compilation/collective_fusion.py | 2 +- .../kv_transfer/kv_connector/v1/nixl_connector.py | 9 +++++---- vllm/executor/executor_base.py | 3 --- vllm/v1/worker/gpu_worker.py | 3 ++- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 71274420c3426..0658b59a2e215 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -1183,7 +1183,7 @@ class AllReduceFusionPass(VllmInductorPass): self.end_and_log() def __del__(self): - if self.disabled: + if getattr(self, "disabled", True): return if flashinfer_comm is not None: flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index ff62f60e5a42c..d3a08af088c11 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -569,9 +569,10 @@ class NixlConnectorWorker: def __del__(self): """Cleanup background threads on destruction.""" - self._handshake_initiation_executor.shutdown(wait=False) - if self._nixl_handshake_listener_t: - self._nixl_handshake_listener_t.join(timeout=0) + if executor := getattr(self, "_handshake_initiation_executor", None): + executor.shutdown(wait=False) + if listener_t := getattr(self, "_nixl_handshake_listener_t", None): + listener_t.join(timeout=0) @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, @@ -1379,4 +1380,4 @@ class NixlKVConnectorStats(KVConnectorStats): # TODO: reduce stats to a single value, calculate latency/throughput return { "num_successful_transfers": self.data["num_successful_transfers"] - } \ No newline at end of file + } diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index d18bef1256af5..42aa8d14a21eb 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -235,9 +235,6 @@ class ExecutorBase(ABC): """Shutdown the executor.""" self.collective_rpc("shutdown") - def __del__(self): - self.shutdown() - async def execute_model_async( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 6855526583f04..8b1e1bb8f45ca 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -683,7 +683,8 @@ class Worker(WorkerBase): tensorizer_config=tensorizer_config, ) def shutdown(self) -> None: - self.model_runner.ensure_kv_transfer_shutdown() + if runner := getattr(self, "model_runner", None): + runner.ensure_kv_transfer_shutdown() def init_worker_distributed_environment( From 535d80056b72443e68a96c1e4a1049cd9a85587d Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 19:02:38 -0700 Subject: [PATCH 14/24] [Misc] Support more collective_rpc return types (#25294) Signed-off-by: Nick Hill --- tests/v1/engine/test_engine_core_client.py | 203 ++++++++++++++++++++- 
vllm/v1/serial_utils.py | 60 ++++-- 2 files changed, 246 insertions(+), 17 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 625a3470e8025..992c4e01386e5 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -8,7 +8,7 @@ import time import uuid from dataclasses import dataclass from threading import Thread -from typing import Optional, Union +from typing import Any, Optional, Union from unittest.mock import MagicMock import pytest @@ -331,6 +331,46 @@ def echo_dc( return [val for _ in range(3)] if return_list else val +# Dummy utility function to test dict serialization with custom types. +def echo_dc_dict( + self, + msg: str, + return_dict: bool = False, +) -> Union[MyDataclass, dict[str, MyDataclass]]: + print(f"echo dc dict util function called: {msg}") + val = None if msg is None else MyDataclass(msg) + # Return dict of dataclasses to verify support for returning dicts + # with custom value types. + if return_dict: + return {"key1": val, "key2": val, "key3": val} + else: + return val + + +# Dummy utility function to test nested structures with custom types. +def echo_dc_nested( + self, + msg: str, + structure_type: str = "list_of_dicts", +) -> Any: + print(f"echo dc nested util function called: {msg}, " + f"structure: {structure_type}") + val = None if msg is None else MyDataclass(msg) + + if structure_type == "list_of_dicts": # noqa + # Return list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}] + return [{"a": val, "b": val}, {"c": val, "d": val}] + elif structure_type == "dict_of_lists": + # Return dict of lists: {"list1": [val, val], "list2": [val, val]} + return {"list1": [val, val], "list2": [val, val]} + elif structure_type == "deep_nested": + # Return deeply nested: {"outer": [{"inner": [val, val]}, + # {"inner": [val]}]} + return {"outer": [{"inner": [val, val]}, {"inner": [val]}]} + else: + return val + + @pytest.mark.asyncio(loop_scope="function") async def test_engine_core_client_util_method_custom_return( monkeypatch: pytest.MonkeyPatch): @@ -384,6 +424,167 @@ async def test_engine_core_client_util_method_custom_return( client.shutdown() +@pytest.mark.asyncio(loop_scope="function") +async def test_engine_core_client_util_method_custom_dict_return( + monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Must set insecure serialization to allow returning custom types. + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + # Monkey-patch core engine utility function to test. + m.setattr(EngineCore, "echo_dc_dict", echo_dc_dict, raising=False) + + engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True) + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=True, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True, + ) + + try: + # Test utility method returning custom / non-native data type. 
+ core_client: AsyncMPClient = client + + # Test single object return + result = await core_client.call_utility_async( + "echo_dc_dict", "testarg3", False) + assert isinstance(result, + MyDataclass) and result.message == "testarg3" + + # Test dict return with custom value types + result = await core_client.call_utility_async( + "echo_dc_dict", "testarg3", True) + assert isinstance(result, dict) and len(result) == 3 + for key, val in result.items(): + assert key in ["key1", "key2", "key3"] + assert isinstance(val, + MyDataclass) and val.message == "testarg3" + + # Test returning dict with None values + result = await core_client.call_utility_async( + "echo_dc_dict", None, True) + assert isinstance(result, dict) and len(result) == 3 + for key, val in result.items(): + assert key in ["key1", "key2", "key3"] + assert val is None + + finally: + client.shutdown() + + +@pytest.mark.asyncio(loop_scope="function") +async def test_engine_core_client_util_method_nested_structures( + monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Must set insecure serialization to allow returning custom types. + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + # Monkey-patch core engine utility function to test. + m.setattr(EngineCore, "echo_dc_nested", echo_dc_nested, raising=False) + + engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True) + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=True, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True, + ) + + try: + core_client: AsyncMPClient = client + + # Test list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}] + result = await core_client.call_utility_async( + "echo_dc_nested", "nested1", "list_of_dicts") + assert isinstance(result, list) and len(result) == 2 + for i, item in enumerate(result): + assert isinstance(item, dict) + if i == 0: + assert "a" in item and "b" in item + assert isinstance( + item["a"], + MyDataclass) and item["a"].message == "nested1" + assert isinstance( + item["b"], + MyDataclass) and item["b"].message == "nested1" + else: + assert "c" in item and "d" in item + assert isinstance( + item["c"], + MyDataclass) and item["c"].message == "nested1" + assert isinstance( + item["d"], + MyDataclass) and item["d"].message == "nested1" + + # Test dict of lists: {"list1": [val, val], "list2": [val, val]} + result = await core_client.call_utility_async( + "echo_dc_nested", "nested2", "dict_of_lists") + assert isinstance(result, dict) and len(result) == 2 + assert "list1" in result and "list2" in result + for key, lst in result.items(): + assert isinstance(lst, list) and len(lst) == 2 + for item in lst: + assert isinstance( + item, MyDataclass) and item.message == "nested2" + + # Test deeply nested: {"outer": [{"inner": [val, val]}, + # {"inner": [val]}]} + result = await core_client.call_utility_async( + "echo_dc_nested", "nested3", "deep_nested") + assert isinstance(result, dict) and "outer" in result + outer_list = result["outer"] + assert isinstance(outer_list, list) and len(outer_list) == 2 + + # First dict in outer list should have "inner" with 2 items + inner_dict1 = outer_list[0] + assert isinstance(inner_dict1, dict) and "inner" in inner_dict1 + inner_list1 = inner_dict1["inner"] + assert isinstance(inner_list1, list) and 
len(inner_list1) == 2 + for item in inner_list1: + assert isinstance(item, + MyDataclass) and item.message == "nested3" + + # Second dict in outer list should have "inner" with 1 item + inner_dict2 = outer_list[1] + assert isinstance(inner_dict2, dict) and "inner" in inner_dict2 + inner_list2 = inner_dict2["inner"] + assert isinstance(inner_list2, list) and len(inner_list2) == 1 + assert isinstance( + inner_list2[0], + MyDataclass) and inner_list2[0].message == "nested3" + + # Test with None values in nested structures + result = await core_client.call_utility_async( + "echo_dc_nested", None, "list_of_dicts") + assert isinstance(result, list) and len(result) == 2 + for item in result: + assert isinstance(item, dict) + for val in item.values(): + assert val is None + + finally: + client.shutdown() + + @pytest.mark.parametrize( "multiprocessing_mode,publisher_config", [(True, "tcp"), (False, "inproc")], diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 50c1470c67edc..c812a2ec6427a 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -7,7 +7,7 @@ import pickle from collections.abc import Sequence from inspect import isclass from types import FunctionType -from typing import Any, Optional, Union +from typing import Any, Callable, Optional, Union import cloudpickle import msgspec @@ -59,6 +59,42 @@ def _typestr(val: Any) -> Optional[tuple[str, str]]: return t.__module__, t.__qualname__ +def _encode_type_info_recursive(obj: Any) -> Any: + """Recursively encode type information for nested structures of + lists/dicts.""" + if obj is None: + return None + if type(obj) is list: + return [_encode_type_info_recursive(item) for item in obj] + if type(obj) is dict: + return {k: _encode_type_info_recursive(v) for k, v in obj.items()} + return _typestr(obj) + + +def _decode_type_info_recursive( + type_info: Any, data: Any, convert_fn: Callable[[Sequence[str], Any], + Any]) -> Any: + """Recursively decode type information for nested structures of + lists/dicts.""" + if type_info is None: + return data + if isinstance(type_info, dict): + assert isinstance(data, dict) + return { + k: _decode_type_info_recursive(type_info[k], data[k], convert_fn) + for k in type_info + } + if isinstance(type_info, list) and ( + # Exclude serialized tensors/numpy arrays. + len(type_info) != 2 or not isinstance(type_info[0], str)): + assert isinstance(data, list) + return [ + _decode_type_info_recursive(ti, d, convert_fn) + for ti, d in zip(type_info, data) + ] + return convert_fn(type_info, data) + + class MsgpackEncoder: """Encoder with custom torch tensor and numpy array serialization. @@ -129,12 +165,10 @@ class MsgpackEncoder: result = obj.result if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: return None, result - # Since utility results are not strongly typed, we also encode - # the type (or a list of types in the case it's a list) to - # help with correct msgspec deserialization. - return _typestr(result) if type(result) is not list else [ - _typestr(v) for v in result - ], result + # Since utility results are not strongly typed, we recursively + # encode type information for nested structures of lists/dicts + # to help with correct msgspec deserialization. 
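# Editorial aside (not part of the patch): a standalone sketch of what the
# recursive type encoding produces for a nested result. `Point` is a made-up
# stand-in for a custom result type such as MyDataclass in the tests above.
from dataclasses import dataclass

from vllm.v1.serial_utils import _encode_type_info_recursive


@dataclass
class Point:
    x: int


nested_result = {"outer": [{"inner": [Point(1), None]}]}
type_info = _encode_type_info_recursive(nested_result)
# type_info mirrors the container structure, replacing each non-container
# leaf with its (module, qualname) pair and keeping None as None, roughly:
#   {"outer": [{"inner": [("__main__", "Point"), None]}]}
# The decoder then walks type_info and the payload together with
# _decode_type_info_recursive, applying convert_fn only at those leaves,
# which is how nested lists/dicts of custom types survive msgspec round-trips.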
+ return _encode_type_info_recursive(result), result if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: raise TypeError(f"Object of type {type(obj)} is not serializable" @@ -288,15 +322,9 @@ class MsgpackDecoder: if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: raise TypeError("VLLM_ALLOW_INSECURE_SERIALIZATION must " "be set to use custom utility result types") - assert isinstance(result_type, list) - if len(result_type) == 2 and isinstance(result_type[0], str): - result = self._convert_result(result_type, result) - else: - assert isinstance(result, list) - result = [ - self._convert_result(rt, r) - for rt, r in zip(result_type, result) - ] + # Use recursive decoding to handle nested structures + result = _decode_type_info_recursive(result_type, result, + self._convert_result) return UtilityResult(result) def _convert_result(self, result_type: Sequence[str], result: Any) -> Any: From c308501cb6a922af8c4183bd65be0094dd73de9a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 20 Sep 2025 04:11:03 +0100 Subject: [PATCH 15/24] Improve weight loading for encoder models in Transformers backend (#25289) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 28 ++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index f40a20dee63d7..3bd4d10316ec6 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -702,21 +702,45 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): class TransformersModel(TransformersBase): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ + # Handle BERT-like models + "bert": "model", # Add `model.` prefix for base model checkpoints "": "model.", - # Remove `model.` from places it should not be + # Remove `model.` prefix if it was already there "model.model.": "model.", + # Pooling adapters will be adjacent to `model` + "model.pooler": "pooler", "model.score": "score", + # Classifier adapter's classifier layer is renamed to score + "model.classifier": "score", + }, + orig_to_new_suffix={ + # Replace legacy suffixes used for norms + ".gamma": ".weight", + ".beta": ".bias", }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) - # Some encoder models have the position_ids buffer in the checkpoint + # After creating a pooling model, `pooler` will be duplicated. + # The one inside `model` comes from the Transformers modelling code. + # The one after `model` is an adapter from vLLM. + # We want to use the adapter so we nullify the original pooler. + if getattr(self.model, "pooler", None) is not None: + self.skip_prefixes.append("pooler.") + self.model.pooler = torch.nn.Identity() + + # Some encoder models have the position_ids buffer in the checkpoint. # vLLM will always pass position_ids as an argument, so we skip loading # the buffer if it exists self.skip_substrs.append("position_ids") + # Some encoder models have the bias of the final classifier layer + # in the checkpoint. 
vLLM does not use this bias, so we skip loading + # it if it exists + self.skip_substrs.append("score.bias") + def create_attention_instances( self, attn_type: AttentionType = AttentionType.DECODER): # TODO(hmellor): Better way to detect encoder models From 36429096171ff8785645c40c662d859dddedd829 Mon Sep 17 00:00:00 2001 From: JartX Date: Sat, 20 Sep 2025 05:18:13 +0200 Subject: [PATCH 16/24] [BUGFIX] GPTQ quantization compatibility for Qwen3 Next MOE models (AutoGPTQ and AutoRound-GPTQ) (#25268) Signed-off-by: JartX --- vllm/model_executor/models/qwen3_next.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 98749c160ba4d..ce917f92bd2e5 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -148,9 +148,11 @@ class Qwen3NextSparseMoeBlock(nn.Module): def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ - # seems to avoid gate quantization. - # See: https://huggingface.co/Qwen/Qwen3-30B-A3B-GPTQ-Int4 - if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + # seems to avoid gate quantization while AutoRound does. + if isinstance( + quant_config, + (GPTQConfig, + GPTQMarlinConfig)) and not quant_config.autoround_version: return None return quant_config From b7f186bbb3101e97bb9ad42b7ffb3cdb4bb590fd Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 21:28:31 -0700 Subject: [PATCH 17/24] [BugFix] Exclude self when checking for port collision (#25286) Signed-off-by: Nick Hill --- vllm/utils/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index d4013a69e99fe..fd1c0af31269c 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -987,8 +987,10 @@ def find_process_using_port(port: int) -> Optional[psutil.Process]: if sys.platform.startswith("darwin"): return None + our_pid = os.getpid() for conn in psutil.net_connections(): - if conn.laddr.port == port: + if conn.laddr.port == port and (conn.pid is not None + and conn.pid != our_pid): try: return psutil.Process(conn.pid) except psutil.NoSuchProcess: From 6c5f82e5aa87cd73ce03ce10fc44138f75ee1aea Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 19 Sep 2025 23:41:23 -0500 Subject: [PATCH 18/24] [BUG FIX][NON-CUDA]quick fix to avoid call cudagraph_unsafe in attention (#25298) Signed-off-by: Chendi Xue --- vllm/attention/layer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 3d1269c0ecea8..544a720524429 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -29,6 +29,10 @@ from vllm.utils import GiB_bytes, direct_register_custom_op logger = init_logger(__name__) USE_XFORMERS_OPS = None +try: + tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, ) +except AttributeError: + tag_cudagraph_unsafe = () # type: ignore[assignment] def check_xformers_availability(): @@ -577,7 +581,7 @@ direct_register_custom_op( mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, - tags=(torch._C.Tag.cudagraph_unsafe, ), + tags=tag_cudagraph_unsafe, ) @@ -628,5 +632,5 @@ direct_register_custom_op( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, - 
tags=(torch._C.Tag.cudagraph_unsafe, ), + tags=tag_cudagraph_unsafe, ) From f91480b2d44c263fb600b5cba5b0e6c7a195f742 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Sat, 20 Sep 2025 13:29:54 +0800 Subject: [PATCH 19/24] [Bugfix] fix tool call arguments is empty (#25223) Signed-off-by: chaunceyjiang Co-authored-by: xin.li --- .../test_completion_with_function_calling.py | 60 +++++++++++++++++++ vllm/entrypoints/chat_utils.py | 8 ++- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 3649cefa9bf42..4355603fcd70b 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import datetime from typing import Union import openai # use the official client for correctness check @@ -284,3 +285,62 @@ async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str, output.extend(chunk.choices[0].delta.tool_calls) for o in output: assert o.id is None or o.id == 'functions.get_current_weather:0' + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("arguments", ["{}", '']) +async def test_no_args_tool_call(client: openai.AsyncOpenAI, model_name: str, + arguments: str): + # Step 1: Define a tool that requires no parameters + tools = [{ + "type": "function", + "function": { + "name": "get_current_time", + "description": + "Get the current date and time. No parameters needed.", + "parameters": { + "type": "object", + "properties": {}, # No parameters + "required": [] # No required fields + } + } + }] + messages = [{"role": "user", "content": "What time is it now?"}] + # Step 2: Send user message and let model decide whether to call the tool + response = await client.chat.completions.create( + model=model_name, + messages=messages, + tools=tools, + tool_choice="auto" # Let model choose automatically + ) + + # Step 3: Check if model wants to call a tool + message = response.choices[0].message + if message.tool_calls: + # Get the first tool call + tool_call = message.tool_calls[0] + tool_name = tool_call.function.name + # Step 4: Execute the tool locally (no parameters) + if tool_name == "get_current_time": + # Test both empty string and "{}" for no-arg tool calls + tool_call.function.arguments = arguments + messages.append(message) + current_time = datetime.datetime.now() + result = current_time.isoformat() + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": result, + }) + # Step 5: Send tool result back to model to continue conversation + final_response = await client.chat.completions.create( + model=model_name, + messages=messages, + ) + # Output final natural language response + assert final_response.choices[0].message.content is not None + + else: + # No tool called — just print model's direct reply + assert message.content is not None diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 00ef39f134653..c2c0ad74ef431 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1450,9 +1450,11 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: and isinstance(message["tool_calls"], list) ): for item in message["tool_calls"]: - item["function"]["arguments"] = json.loads( - 
item["function"]["arguments"] - ) + # if arguments is None or empty string, set to {} + if content := item["function"].get("arguments"): + item["function"]["arguments"] = json.loads(content) + else: + item["function"]["arguments"] = {} def parse_chat_messages( From c60e6137f0bf2034853919b3a9d705d7e06b93cf Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 20 Sep 2025 13:30:22 +0800 Subject: [PATCH 20/24] [Optimization] Avoid repeated model architecture conversion for pooling models (#25261) Signed-off-by: DarkLight1337 --- vllm/config/model.py | 22 +++++++++++++++++++++- vllm/model_executor/model_loader/utils.py | 17 ++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 4e847922b61e6..921322bb475c5 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -322,8 +322,28 @@ class ModelConfig: factors.append(self.override_generation_config) factors.append(self.rope_scaling) factors.append(self.rope_theta) + # hf_config can control how the model looks! - factors.append(self.hf_config.to_json_string()) + try: + hf_config_json = self.hf_config.to_json_string(use_diff=False) + except TypeError: + from transformers import PretrainedConfig + + from vllm.utils.jsontree import json_map_leaves + + # Handle nested HF configs with unserializable values gracefully + hf_config_json = json.dumps( + json_map_leaves( + lambda v: v.to_dict() + if isinstance(v, PretrainedConfig) else str(v), + self.hf_config.to_dict(), + ), + indent=2, + sort_keys=True, + ) + "\n" + + factors.append(hf_config_json) + str_factors = str(factors) assert_hashable(str_factors) return hashlib.sha256(str(factors).encode()).hexdigest() diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index bd1773c753a93..e007d431880eb 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -165,7 +165,11 @@ def device_loading_context(module: torch.nn.Module, # New parameters or parameters already on target device are untouched -def get_model_architecture( +_MODEL_ARCH_BY_HASH = dict[str, tuple[type[nn.Module], str]]() +"""Caches the outputs of `_get_model_architecture`.""" + + +def _get_model_architecture( model_config: ModelConfig) -> tuple[type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) @@ -209,6 +213,17 @@ def get_model_architecture( return model_cls, arch +def get_model_architecture( + model_config: ModelConfig) -> tuple[type[nn.Module], str]: + key = model_config.compute_hash() + if key in _MODEL_ARCH_BY_HASH: + return _MODEL_ARCH_BY_HASH[key] + + model_arch = _get_model_architecture(model_config) + _MODEL_ARCH_BY_HASH[key] = model_arch + return model_arch + + def get_model_cls(model_config: ModelConfig) -> type[nn.Module]: return get_model_architecture(model_config)[0] From 9607d5eb449711b349d4c2bee0a9c94afcc7ed14 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Fri, 19 Sep 2025 23:43:59 -0700 Subject: [PATCH 21/24] [Hybrid Allocator] Support full attention with different hidden size (#25101) Signed-off-by: Chen Zhang --- tests/v1/core/test_kv_cache_utils.py | 118 +++++++++++++++++++--- vllm/v1/core/kv_cache_utils.py | 144 ++++++++++++++++++++------- vllm/v1/engine/core.py | 16 ++- vllm/v1/kv_cache_interface.py | 70 +++++++++++++ vllm/v1/worker/gpu_model_runner.py | 65 ++++++------ vllm/v1/worker/utils.py | 3 +- 6 files changed, 324 insertions(+), 92 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py 
b/tests/v1/core/test_kv_cache_utils.py index 4bf6bbbfeae28..4cb7ed6ce3824 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -18,12 +18,14 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.core.kv_cache_utils import ( BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, - get_kv_cache_configs, get_max_concurrency_for_kv_cache_config, - get_request_block_hasher, hash_block_tokens, init_none_hash, - is_kv_cache_type_uniform, make_block_hash_with_group_id) + generate_scheduler_kv_cache_config, get_kv_cache_configs, + get_max_concurrency_for_kv_cache_config, get_request_block_hasher, + hash_block_tokens, init_none_hash, is_kv_cache_spec_uniform, + make_block_hash_with_group_id) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - KVCacheTensor, SlidingWindowSpec) + KVCacheTensor, SlidingWindowSpec, + UniformTypeKVCacheSpecs) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -927,36 +929,36 @@ def test_merge_kv_cache_spec(): assert merged_layer_spec.sliding_window == 1 -def test_is_kv_cache_type_uniform(): +def test_is_kv_cache_spec_uniform(): kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_kv_cache_spec(num_kv_heads=32), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_kv_cache_spec(num_kv_heads=32, sliding_window=1), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), } - assert not is_kv_cache_type_uniform(kv_cache_spec) + assert not is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=2), } - assert not is_kv_cache_type_uniform(kv_cache_spec) + assert not is_kv_cache_spec_uniform(kv_cache_spec) @pytest.mark.parametrize( @@ -1286,14 +1288,28 @@ def test_get_kv_cache_config_one_worker(): ], ) - # different hidden size, unimplemented + # different hidden size kv_cache_specs_hybrid = { 'layer_1': new_kv_cache_spec(head_size=128), - 'layer_2': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=64), } - with pytest.raises(NotImplementedError): - get_kv_cache_configs(vllm_config, [kv_cache_specs_hybrid], - [mem_per_block_per_layer * 2 * 32])[0] + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 3 * 32])[0] + assert kv_cache_config_hybrid == KVCacheConfig( + num_blocks=32, + kv_cache_tensors=[ + KVCacheTensor(size=mem_per_block_per_layer * 32 * 2, + shared_by=["layer_1"]), + KVCacheTensor(size=mem_per_block_per_layer * 32, + shared_by=["layer_2"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer_1", "layer_2"], + UniformTypeKVCacheSpecs( + block_size=16, + kv_cache_specs=kv_cache_specs_hybrid)) + ]) # Test num_gpu_blocks_override 
vllm_config.cache_config.num_gpu_blocks_override = 16 @@ -1324,3 +1340,75 @@ def test_get_kv_cache_configs_attention_free(): kv_cache_groups=[], ) ] + + +def test_generate_uniform_type_kv_cache_specs(): + # All layers are full attention, can be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=128), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec == UniformTypeKVCacheSpecs( + block_size=16, kv_cache_specs=kv_cache_specs) + + # Full attention + sliding window, cannot be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_sliding_window_spec(sliding_window=1), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + # different order of full attention + sliding window, cannot be merged + kv_cache_specs = { + 'layer_1': new_sliding_window_spec(sliding_window=1), + 'layer_2': new_kv_cache_spec(), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + # Same-size sliding window, can be merged + kv_cache_specs = { + 'layer_1': new_sliding_window_spec(sliding_window=1), + 'layer_2': new_sliding_window_spec(sliding_window=1, head_size=128), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec == UniformTypeKVCacheSpecs( + block_size=16, kv_cache_specs=kv_cache_specs) + + # different block sizes, cannot be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(block_size=16), + 'layer_2': new_kv_cache_spec(block_size=32), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + +def test_generate_scheduler_kv_cache_config(): + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=128), + } + kv_cache_configs = [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer_1', 'layer_2'], + UniformTypeKVCacheSpecs( + block_size=16, + kv_cache_specs=kv_cache_specs)), + ], + ) + ] + scheduler_kv_cache_config = generate_scheduler_kv_cache_config( + kv_cache_configs) + assert scheduler_kv_cache_config == KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer_1', 'layer_2'], new_kv_cache_spec()) + ], + ) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index bc2ec5e42ea20..3ccd00121f8ed 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """KV-Cache Utilities.""" +import copy import os from collections import defaultdict, deque from collections.abc import Iterable, Sequence @@ -15,7 +16,8 @@ from vllm.utils import GiB_bytes, cdiv, sha256_cbor from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - KVCacheTensor, SlidingWindowSpec) + KVCacheTensor, SlidingWindowSpec, + UniformTypeKVCacheSpecs) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -750,7 +752,7 @@ def create_kv_cache_group_specs( return kv_cache_groups -def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: +def is_kv_cache_spec_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: """ Whether all layers in the given KVCacheSpec have the same KV cache spec. 
Note that we regard FullAttentionSpec with and without sliding window as @@ -793,6 +795,21 @@ def get_max_concurrency_for_kv_cache_config( return max_concurrency +def may_override_num_blocks(vllm_config: VllmConfig, num_blocks: int) -> int: + """ + Override the number of kv cache blocks if `num_gpu_blocks_override` is set. + """ + if vllm_config.cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = \ + vllm_config.cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) + num_blocks = num_gpu_blocks_override + + return num_blocks + + def get_num_blocks(vllm_config: VllmConfig, num_layers: int, available_memory: int, page_size: int) -> int: """ @@ -806,13 +823,7 @@ def get_num_blocks(vllm_config: VllmConfig, num_layers: int, """ num_blocks = int(available_memory // page_size // num_layers) num_blocks = max(num_blocks, 0) - if vllm_config.cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = \ - vllm_config.cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) - num_blocks = num_gpu_blocks_override + num_blocks = may_override_num_blocks(vllm_config, num_blocks) return num_blocks @@ -825,11 +836,11 @@ def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int: return page_sizes.pop() -def _get_kv_cache_groups_uniform_type( +def _get_kv_cache_groups_uniform_spec( kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]: """ - Generates the KV cache configuration for a model with one type of KV cache. - Divide the available memory equally among all layers. + Generates the KV cache configuration for a model with the same KV cache + spec for all layers. Args: kv_cache_specs: The kv cache spec of each attention layer in the model @@ -842,6 +853,22 @@ def _get_kv_cache_groups_uniform_type( [list(kv_cache_specs.keys())]) +def _get_kv_cache_groups_uniform_type( + spec: UniformTypeKVCacheSpecs) -> list[KVCacheGroupSpec]: + """ + Generates the KV cache configuration for a model with one type of KV cache + but different hidden sizes. All layers are merged into one group. + + Args: + spec: The UniformTypeKVCacheSpecs of the model + + Returns: + The generated KVCacheGroupSpecs + """ + + return [KVCacheGroupSpec(list(spec.kv_cache_specs.keys()), spec)] + + def is_kv_cache_page_size_uniform( kv_cache_spec: dict[str, KVCacheSpec]) -> bool: """ @@ -1000,28 +1027,45 @@ def get_kv_cache_config_from_groups(vllm_config: VllmConfig, ) # Determine how model runners should initialize the KV cache tensors. - # We will have group_size memory pools, each is shared by one layer from - # each group. As layers of different groups have different block table, - # they will use different parts of the shared Tensor. - # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2), - # (sw.1, padding) will be: (group_size = 2) - # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2 - # full.1, sw.2: share another Tensor with size=available_memory//2 - group_size = max(len(group.layer_names) for group in kv_cache_groups) + if len(kv_cache_groups) == 1 and \ + isinstance(kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs): + # Special case: all layers have the same type of KV cache but with + # different hidden size. Allocate different amount of memory for each + # layer based on its hidden size. 
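# Editorial aside (not part of the patch): a worked example of this sizing,
# mirroring test_get_kv_cache_config_one_worker above. The concrete byte
# value below is a placeholder; the test computes it from the layer spec.
mem_per_block_per_layer = 65536  # placeholder per-block bytes, head_size=64
available_memory = mem_per_block_per_layer * 3 * 32  # what the test grants
# layer_1 (head_size=128) has twice the per-block footprint of layer_2
# (head_size=64), so the merged UniformTypeKVCacheSpecs page size is
# 3 * mem_per_block_per_layer, and:
num_blocks = available_memory // (3 * mem_per_block_per_layer)   # == 32
layer_1_tensor_bytes = 2 * mem_per_block_per_layer * num_blocks  # == mem * 32 * 2
layer_2_tensor_bytes = 1 * mem_per_block_per_layer * num_blocks  # == mem * 32
# These are exactly the two per-layer KVCacheTensor sizes asserted in the
# test, rather than the shared-tensor layout used in the general case below.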
+ num_blocks = available_memory // kv_cache_groups[ + 0].kv_cache_spec.page_size_bytes + num_blocks = may_override_num_blocks(vllm_config, num_blocks) + per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs + kv_cache_tensors = [ + KVCacheTensor(size=per_layer_specs[layer_name].page_size_bytes * + num_blocks, + shared_by=[layer_name]) + for layer_name in kv_cache_groups[0].layer_names + ] + else: + # General case: + # We will have group_size memory pools, each is shared by one layer from + # each group. As layers of different groups have different block table, + # they will use different parts of the shared Tensor. + # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2), + # (sw.1, padding) will be: (group_size = 2) + # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2 + # full.1, sw.2: share another Tensor with size=available_memory//2 + group_size = max(len(group.layer_names) for group in kv_cache_groups) - page_size = get_uniform_page_size(kv_cache_specs) - assert group_size > 0, "group_size must be greater than 0" - num_blocks = get_num_blocks(vllm_config, group_size, available_memory, - page_size) - per_memory_pool_size = page_size * num_blocks - kv_cache_tensors = [] - for i in range(group_size): - shared_by = [] - for j in range(len(kv_cache_groups)): - if i < len(kv_cache_groups[j].layer_names): - shared_by.append(kv_cache_groups[j].layer_names[i]) - kv_cache_tensors.append( - KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by)) + page_size = get_uniform_page_size(kv_cache_specs) + assert group_size > 0, "group_size must be greater than 0" + num_blocks = get_num_blocks(vllm_config, group_size, available_memory, + page_size) + kv_cache_tensors = [] + for i in range(group_size): + shared_by = [] + for j in range(len(kv_cache_groups)): + if i < len(kv_cache_groups[j].layer_names): + shared_by.append(kv_cache_groups[j].layer_names[i]) + kv_cache_tensors.append( + KVCacheTensor(size=page_size * num_blocks, + shared_by=shared_by)) kv_cache_config = KVCacheConfig( num_blocks=num_blocks, @@ -1059,7 +1103,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): kv_cache_spec: The kv cache spec of each attention layer in the model """ - if is_kv_cache_type_uniform(kv_cache_spec): + if is_kv_cache_spec_uniform(kv_cache_spec): return logger.warning( @@ -1097,7 +1141,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): attention_chunk_size=spec.attention_chunk_size, ) - if not is_kv_cache_type_uniform(kv_cache_spec): + if not is_kv_cache_spec_uniform(kv_cache_spec): raise ValueError("Hybrid KV cache manager is disabled but failed to " "convert the KV cache specs to one unified type.") @@ -1122,11 +1166,16 @@ def get_kv_cache_groups( # This returns an empty list to allow for the KVCacheManager to handle # attention free models. return [] - elif is_kv_cache_type_uniform(kv_cache_spec): + elif is_kv_cache_spec_uniform(kv_cache_spec): # KV cache of all layers are the same, which is true for # most models. Allocate the same amount of memory for # each layer. - return _get_kv_cache_groups_uniform_type(kv_cache_spec) + return _get_kv_cache_groups_uniform_spec(kv_cache_spec) + elif uniform_spec := UniformTypeKVCacheSpecs.from_specs(kv_cache_spec): + # All layers need the same number of token slots (e.g., all layers are + # full attention, or all layers are sliding window attention with the + # same window size). Put all layers into one group. 
+ return _get_kv_cache_groups_uniform_type(uniform_spec) elif is_kv_cache_page_size_uniform(kv_cache_spec): # Model contains multiple attention types, but KV cache of all layers # have the same physical memory per block per layer. Split the layers @@ -1137,6 +1186,27 @@ def get_kv_cache_groups( raise NotImplementedError +def generate_scheduler_kv_cache_config( + kv_cache_configs: list[KVCacheConfig]) -> KVCacheConfig: + """ + Generate the KV cache configuration for the scheduler. + """ + assert all([ + cfg.num_blocks == kv_cache_configs[0].num_blocks + for cfg in kv_cache_configs + ]) + # All workers have the same kv_cache_config except layer names, so use + # an arbitrary one to initialize the scheduler. + cfg = copy.deepcopy(kv_cache_configs[0]) + for group in cfg.kv_cache_groups: + if isinstance(group.kv_cache_spec, UniformTypeKVCacheSpecs): + # All layers in the UniformTypeKVCacheSpecs have the same type, + # so use an arbitrary one to initialize the scheduler. + group.kv_cache_spec = next( + iter(group.kv_cache_spec.kv_cache_specs.values())) + return cfg + + def get_kv_cache_configs(vllm_config: VllmConfig, kv_cache_specs: list[dict[str, KVCacheSpec]], available_memory: list[int]) -> list[KVCacheConfig]: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a022e9c0d7058..a43042a5510a8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -29,7 +29,9 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket, resolve_obj_by_qualname, set_process_title) -from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_configs, +from vllm.v1.core.kv_cache_utils import (BlockHash, + generate_scheduler_kv_cache_config, + get_kv_cache_configs, get_request_block_hasher, init_none_hash) from vllm.v1.core.sched.interface import SchedulerInterface @@ -196,16 +198,10 @@ class EngineCore: kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs, available_gpu_memory) - - # All workers have the same kv_cache_config except layer names, so use - # an arbitrary one to initialize the scheduler. - assert all([ - cfg.num_blocks == kv_cache_configs[0].num_blocks - for cfg in kv_cache_configs - ]) - num_gpu_blocks = kv_cache_configs[0].num_blocks + scheduler_kv_cache_config = generate_scheduler_kv_cache_config( + kv_cache_configs) + num_gpu_blocks = scheduler_kv_cache_config.num_blocks num_cpu_blocks = 0 - scheduler_kv_cache_config = kv_cache_configs[0] # Initialize kv cache and warmup the execution self.model_executor.initialize_from_config(kv_cache_configs) diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 0cf92a680a689..f72cc8f93a6c2 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -234,6 +234,76 @@ class CrossAttentionSpec(AttentionSpec): return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes +@dataclass(frozen=True) +class UniformTypeKVCacheSpecs(KVCacheSpec): + """ + A KV cache spec for multiple layers with the same type of attention. Here, + same types means always need the same number of token slots. For example, + sliding window attentions with different window sizes are not the same type + and should not be merged into one UniformTypeKVCacheSpecs. 
+ """ + kv_cache_specs: dict[str, KVCacheSpec] + + @property + def page_size_bytes(self) -> int: + return sum(spec.page_size_bytes + for spec in self.kv_cache_specs.values()) + + def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: + max_num_pages = max( + cdiv(spec.max_memory_usage_bytes(vllm_config), + spec.page_size_bytes) + for spec in self.kv_cache_specs.values()) + return max_num_pages * self.page_size_bytes + + @classmethod + def is_uniform_type(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> bool: + """ + Whether all layers have the same type of KV cache spec. + """ + block_sizes = set(spec.block_size for spec in kv_cache_specs.values()) + if len(block_sizes) > 1: + # Different block sizes, not uniform. + return False + one_spec = next(iter(kv_cache_specs.values())) + if isinstance(one_spec, (FullAttentionSpec, CrossAttentionSpec)): + return all( + isinstance(spec, type(one_spec)) + for spec in kv_cache_specs.values()) + elif isinstance(one_spec, SlidingWindowSpec): + return all( + isinstance(spec, SlidingWindowSpec) + and spec.sliding_window == one_spec.sliding_window + for spec in kv_cache_specs.values()) + elif isinstance(one_spec, ChunkedLocalAttentionSpec): + return all( + isinstance(spec, ChunkedLocalAttentionSpec) + and spec.attention_chunk_size == one_spec.attention_chunk_size + for spec in kv_cache_specs.values()) + elif isinstance(one_spec, MambaSpec): + return all( + isinstance(spec, MambaSpec) and spec.num_speculative_blocks == + one_spec.num_speculative_blocks + for spec in kv_cache_specs.values()) + else: + # NOTE(Chen): Please add new branches for new KV cache spec types. + raise NotImplementedError( + f"Unsupported KV cache spec type: {type(one_spec)}") + + @classmethod + def from_specs(cls, kv_cache_specs: dict[str, + KVCacheSpec]) -> Optional[Self]: + """ + Return a SameTypeKVCacheSpecs object if all layers have the same type + of KV cache spec. Return None if not. 
+ """ + if cls.is_uniform_type(kv_cache_specs): + block_size = next(iter(kv_cache_specs.values())).block_size + return cls(block_size=block_size, kv_cache_specs=kv_cache_specs) + else: + return None + + @dataclass class KVCacheTensor: """ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index dffadd1d769b7..233df8f1b0e9b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -8,7 +8,7 @@ from collections import defaultdict from collections.abc import Iterator from contextlib import contextmanager from copy import deepcopy -from typing import TYPE_CHECKING, Any, Optional, Union, cast +from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union, cast import numpy as np import torch @@ -74,7 +74,8 @@ from vllm.v1.kv_cache_interface import (AttentionSpec, EncoderOnlyAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - MambaSpec, SlidingWindowSpec) + MambaSpec, SlidingWindowSpec, + UniformTypeKVCacheSpecs) # yapf: enable from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, LogprobsLists, LogprobsTensors, @@ -1187,7 +1188,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): common_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, num_common_prefix_blocks, - kv_cache_group_spec.kv_cache_spec, + attn_group.kv_cache_spec, builder, ) @@ -3453,12 +3454,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert len(self.attn_groups) == 0, \ "Attention backends are already initialized" - def get_attn_backends_for_layers( - layer_names: list[str] - ) -> dict[type[AttentionBackend], list[str]]: - layers = get_layers_from_vllm_config(self.vllm_config, - AttentionLayerBase, - layer_names) + class AttentionGroupKey(NamedTuple): + attn_backend: type[AttentionBackend] + kv_cache_spec: KVCacheSpec + + def get_attn_backends_for_group( + kv_cache_group_spec: KVCacheGroupSpec, + ) -> dict[AttentionGroupKey, list[str]]: + layers = get_layers_from_vllm_config( + self.vllm_config, AttentionLayerBase, + kv_cache_group_spec.layer_names) attn_backends = {} attn_backend_layers = defaultdict(list) # Dedupe based on full class name; this is a bit safer than @@ -3466,7 +3471,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # attention backend subclasses (e.g. ChunkedLocalAttention) unless # they are cached correctly, there will be different objects per # layer. 
- for layer_name in layer_names: + for layer_name in kv_cache_group_spec.layer_names: attn_backend = layers[layer_name].get_attn_backend() if layer_name in self.kv_sharing_fast_prefill_eligible_layers: @@ -3475,8 +3480,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_backend, ) - key = attn_backend.full_cls_name() - attn_backends[key] = attn_backend + full_cls_name = attn_backend.full_cls_name() + layer_kv_cache_spec = kv_cache_group_spec.kv_cache_spec + if isinstance(layer_kv_cache_spec, UniformTypeKVCacheSpecs): + layer_kv_cache_spec = layer_kv_cache_spec.kv_cache_specs[ + layer_name] + key = (full_cls_name, layer_kv_cache_spec) + attn_backends[key] = AttentionGroupKey(attn_backend, + layer_kv_cache_spec) attn_backend_layers[key].append(layer_name) return { attn_backends[k]: v @@ -3484,11 +3495,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): } def create_attn_groups( - attn_backends_map: dict[AttentionBackend, list[str]], - kv_cache_spec: KVCacheSpec, + attn_backends_map: dict[AttentionGroupKey, list[str]], ) -> list[AttentionGroup]: attn_groups: list[AttentionGroup] = [] - for attn_backend, layer_names in attn_backends_map.items(): + for (attn_backend, + kv_cache_spec), layer_names in attn_backends_map.items(): attn_metadata_builders = [] attn_metadata_builders.append(attn_backend.get_builder_cls()( kv_cache_spec, @@ -3506,16 +3517,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): )) attn_group = AttentionGroup(attn_backend, attn_metadata_builders, - layer_names) + layer_names, kv_cache_spec) attn_groups.append(attn_group) return attn_groups for kv_cache_group_spec in kv_cache_config.kv_cache_groups: - kv_cache_spec = kv_cache_group_spec.kv_cache_spec - attn_backends = get_attn_backends_for_layers( - kv_cache_group_spec.layer_names) - self.attn_groups.append( - create_attn_groups(attn_backends, kv_cache_spec)) + attn_backends = get_attn_backends_for_group(kv_cache_group_spec) + self.attn_groups.append(create_attn_groups(attn_backends)) # Calculate reorder batch threshold (if needed) self.calculate_reorder_batch_threshold() @@ -3680,14 +3688,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _attn_group_iterator(self) -> Iterator[AttentionGroup]: return itertools.chain.from_iterable(self.attn_groups) - def _kv_cache_spec_attn_group_iterator( - self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]: + def _kv_cache_spec_attn_group_iterator(self) -> Iterator[AttentionGroup]: if not self.kv_cache_config.kv_cache_groups: return - for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups): - for attn_group in attn_groups: - yield self.kv_cache_config.kv_cache_groups[ - kv_cache_spec_id].kv_cache_spec, attn_group + for attn_groups in self.attn_groups: + yield from attn_groups def _reshape_kv_cache_tensors( self, @@ -3707,7 +3712,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): """ kv_caches: dict[str, torch.Tensor] = {} has_attn, has_mamba = False, False - for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec attn_backend = group.backend for layer_name in group.layer_names: if layer_name in self.runner_only_attn_layers: @@ -3787,7 +3793,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_caches: The KV cache buffer of each layer. 
""" - for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec for layer_name in group.layer_names: kv_cache = kv_caches[layer_name] if (isinstance(kv_cache_spec, AttentionSpec) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index b76ac633892f3..021d18b2500f0 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -15,7 +15,7 @@ from vllm.multimodal.registry import MultiModalRegistry from vllm.platforms import current_platform from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget -from vllm.v1.kv_cache_interface import KVCacheGroupSpec +from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec if TYPE_CHECKING: from vllm.attention.layer import Attention @@ -132,6 +132,7 @@ class AttentionGroup: backend: type[AttentionBackend] metadata_builders: list[AttentionMetadataBuilder] layer_names: list[str] + kv_cache_spec: KVCacheSpec def get_metadata_builder(self, ubatch_id: Optional[int] = None From be874c020196080305baf988ed8c1c82047323be Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 20 Sep 2025 00:04:05 -0700 Subject: [PATCH 22/24] [Bugfix] Fix Qwen3-VL-MoE weight loading for EP (#25300) Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen3_vl_moe.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index 625f94cf7ad77..7912cf3ea52b0 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -122,9 +122,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): def load_fused_expert_weights(self, name: str, params_dict: dict, loaded_weight: torch.Tensor, shard_id: str, - num_experts: int): + num_experts: int) -> bool: param = params_dict[name] weight_loader = typing.cast(Callable[..., bool], param.weight_loader) + loaded_local_expert = False for expert_id in range(num_experts): curr_expert_weight = loaded_weight[expert_id] success = weight_loader(param, @@ -133,9 +134,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): shard_id, expert_id, return_success=True) - if not success: - return False - return True + if success: + loaded_local_expert = True + + return loaded_local_expert def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -345,4 +347,4 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): for _ in range(self.deepstack_num_level) ] if self.use_deepstack else None self.visual_dim = config.vision_config.out_hidden_size - self.multiscale_dim = self.visual_dim * self.deepstack_num_level \ No newline at end of file + self.multiscale_dim = self.visual_dim * self.deepstack_num_level From 3d9a1d2de5091455bb2fbf6b21fc9188fd4612a4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 20 Sep 2025 15:14:35 +0800 Subject: [PATCH 23/24] [V1] Support `LLM.apply_model` (#18465) Signed-off-by: DarkLight1337 --- tests/conftest.py | 12 +--- tests/kernels/moe/test_mxfp4_moe.py | 37 ++++++---- .../multimodal/generation/test_qwen2_vl.py | 46 ++++++------ tests/models/quantization/test_awq.py | 2 +- tests/quantization/test_compressed_tensors.py | 18 +++-- tests/quantization/test_fp8.py | 8 +-- tests/quantization/test_gptq_dynamic.py | 71 ++++++++++--------- tests/quantization/test_lm_head.py | 4 +- tests/quantization/test_modelopt.py | 10 +-- 
tests/quantization/test_ptpc_fp8.py | 47 +++++++----- tests/quantization/test_quark.py | 26 +++---- .../test_register_quantization_config.py | 17 +++-- vllm/engine/llm_engine.py | 7 +- vllm/entrypoints/llm.py | 9 ++- vllm/executor/executor_base.py | 33 +++++---- vllm/v1/engine/llm_engine.py | 7 +- vllm/worker/worker_base.py | 9 ++- 17 files changed, 194 insertions(+), 169 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3cd93f4ad3289..e8e95357ff5b9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -987,17 +987,7 @@ class VllmRunner: return [req_output.outputs.score for req_output in req_outputs] def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: - if hasattr(self.llm.llm_engine, "model_executor"): - # This works either in V0 or in V1 with - # VLLM_ENABLE_V1_MULTIPROCESSING=0 - executor = self.llm.llm_engine.model_executor - return executor.apply_model(func) - - # This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1 - def _apply_model(self): - return func(self.get_model()) - - return self.llm.llm_engine.collective_rpc(_apply_model) + return self.llm.apply_model(func) def get_llm(self) -> LLM: return self.llm diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py index a3b8f07638d9a..61d3311cc1624 100644 --- a/tests/kernels/moe/test_mxfp4_moe.py +++ b/tests/kernels/moe/test_mxfp4_moe.py @@ -1,21 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import importlib import importlib.metadata from dataclasses import dataclass +from importlib.util import find_spec from typing import Optional import pytest import torch from packaging import version +from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 + QuarkLinearMethod, QuarkW4A4MXFP4) +from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501 + QuarkW4A4MXFp4MoEMethod) from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer -QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( - "quark") is not None and version.parse( - importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') +QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse( + importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda( ) and current_platform.is_device_capability(100) @@ -39,6 +42,12 @@ class ModelCase: tp: int +@pytest.fixture(scope="function", autouse=True) +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + @pytest.mark.parametrize('model_case', [ ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1), ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8), @@ -55,21 +64,19 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): tensor_parallel_size=model_case.tp, load_format="dummy") as llm: - # TODO: llm.apply_model(check_model) currently relies on V0 internals. - # Re-enable this later. 
- # def check_model(model): - # layer = model.model.layers[0] + def check_model(model): + layer = model.model.layers[0] - # qkv_proj = layer.self_attn.qkv_proj + qkv_proj = layer.self_attn.qkv_proj - # assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) - # assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4) + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4) - # assert isinstance(layer.mlp.experts.quant_method, - # QuarkW4A4MXFp4MoEMethod) + assert isinstance(layer.mlp.experts.quant_method, + QuarkW4A4MXFp4MoEMethod) - # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4": - # llm.apply_model(check_model) + if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4": + llm.apply_model(check_model) output = llm.generate_greedy("Today I am in the French Alps and", max_tokens=20) diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index a81f5e7ec8872..e56f4e4075be4 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -10,6 +10,7 @@ from PIL import Image from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video +from vllm.utils import set_default_torch_num_threads from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, PromptVideoInput, VllmRunner) @@ -17,11 +18,9 @@ from ...utils import check_logprobs_close @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - V1 Test: batch_make_xxxxx_embeddings calls a V0 internal - """ - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") models = ["Qwen/Qwen2-VL-2B-Instruct"] @@ -126,9 +125,8 @@ def batch_make_image_embeddings( image_grid_thw_on_device = image_grid_thw.to(visual.device, dtype=torch.int64) return visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + grid_thw=image_grid_thw_on_device).cpu() - # V1 Test: this calls a V0 internal. image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches @@ -210,7 +208,7 @@ def batch_make_video_embeddings( video_grid_thw_on_device = video_grid_thw.to(visual.device, dtype=torch.int64) return visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + grid_thw=video_grid_thw_on_device).cpu() # V1 Test: this calls a V0 internal. 
video_embeds = torch.concat(llm.apply_model(get_image_embeds)) @@ -266,19 +264,22 @@ def run_embedding_input_test( processor = AutoProcessor.from_pretrained(model) # max_model_len should be greater than image_feature_size - with vllm_runner(model, - runner="generate", - max_model_len=4000, - max_num_seqs=3, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: + with set_default_torch_num_threads(1): + vllm_model = vllm_runner( + model, + runner="generate", + max_model_len=4000, + max_num_seqs=3, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + ) + with vllm_model: outputs_per_case_for_original_input = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, @@ -329,9 +330,8 @@ def run_embedding_input_test( @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, - size_factors, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: + size_factors, dtype, max_tokens, + num_logprobs, monkeypatch) -> None: images = [asset.pil_image for asset in image_assets] inputs_per_case: list[tuple[ diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py index bd696198931ff..7005e435ecf46 100644 --- a/tests/models/quantization/test_awq.py +++ b/tests/models/quantization/test_awq.py @@ -112,7 +112,7 @@ def test_awq_models(vllm_runner, image_assets, source_model, quant_model, monkeypatch) -> None: # Test V1: this test hangs during setup on single-scale input. - # TODO: fixure out why and re-enable this on V1. + # TODO: figure out why and re-enable this on V1. monkeypatch.setenv("VLLM_USE_V1", "0") run_awq_test( vllm_runner, diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 484f53246f349..b7949a488ad05 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -43,12 +43,9 @@ ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [ @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. 
- """ - if not current_platform.is_cpu(): - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.parametrize( @@ -176,10 +173,11 @@ def test_compressed_tensors_w8a8_logprobs( dtype = "bfloat16" - # skip language translation prompt for the static per tensor asym model - if (model_path == - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym" - ): # noqa: E501 + # skip language translation prompt for the static per tensor models + if model_path in ( + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", + ): example_prompts = example_prompts[0:-1] with hf_runner(model_path, dtype=dtype) as hf_model: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index d781f462b4ad7..db53061cf2d1a 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -60,8 +60,8 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, if use_rocm_aiter: monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: def check_model(model): @@ -104,8 +104,8 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, if use_rocm_aiter: monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index aea50e99c1dd5..00a5946ed0154 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -31,41 +31,46 @@ MODEL_QUANT = [ @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, monkeypatch): - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") - - vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( GPTQLinearMethod) - for name, submodule in (vllm_model.llm.llm_engine.model_executor. 
- driver_worker.model_runner.model.named_modules()): - if name == "lm_head": - assert isinstance(submodule.quant_method, linear_method_cls) - elif name == 'model.layers.0.self_attn.qkv_proj': - # The first layer is quantized using bits=4, group_size=128 - # desc_act=True - assert isinstance(submodule.quant_method, linear_method_cls) - config = submodule.quant_method.quant_config - assert config.weight_bits == 4 - assert config.group_size == 128 - assert config.desc_act - elif name == 'model.layers.1.self_attn.qkv_proj': - # The second layer is quantized using bits=8, group_size=32 - # desc_act=False - assert isinstance(submodule.quant_method, linear_method_cls) - config = submodule.quant_method.quant_config - assert get_dynamic_override(config, layer_name=name, - key="bits") == 8 - assert get_dynamic_override(config, - layer_name=name, - key="group_size") == 32 - assert not get_dynamic_override( - config, layer_name=name, key="desc_act") - elif (name == 'model.layers.2.self_attn.qkv_proj' - or name == 'model.layers.2.mlp.gate_up_proj'): - # All other layers (layer index >= 2) are not quantized - assert isinstance(submodule.quant_method, UnquantizedLinearMethod) + with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm: - del vllm_model + def check_model(model): + for name, submodule in model.named_modules(): + if name == "lm_head": + assert isinstance(submodule.quant_method, + linear_method_cls) + elif name == 'model.layers.0.self_attn.qkv_proj': + # The first layer is quantized using bits=4, group_size=128 + # desc_act=True + assert isinstance(submodule.quant_method, + linear_method_cls) + config = submodule.quant_method.quant_config + assert config.weight_bits == 4 + assert config.group_size == 128 + assert config.desc_act + elif name == 'model.layers.1.self_attn.qkv_proj': + # The second layer is quantized using bits=8, group_size=32 + # desc_act=False + assert isinstance(submodule.quant_method, + linear_method_cls) + config = submodule.quant_method.quant_config + assert get_dynamic_override(config, + layer_name=name, + key="bits") == 8 + assert get_dynamic_override(config, + layer_name=name, + key="group_size") == 32 + assert not get_dynamic_override( + config, layer_name=name, key="desc_act") + elif (name == 'model.layers.2.self_attn.qkv_proj' + or name == 'model.layers.2.mlp.gate_up_proj'): + # All other layers (layer index >= 2) are not quantized + assert isinstance(submodule.quant_method, + UnquantizedLinearMethod) + + llm.apply_model(check_model) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index b24964a9d0a9f..e69d4ad349c38 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -29,8 +29,8 @@ def test_lm_head( lm_head_quantized: bool, monkeypatch, ) -> None: - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. 
+ monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model: diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index c60a03f44baec..e7174be73626a 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -11,16 +11,12 @@ import pytest import torch from tests.quantization.utils import is_quant_method_supported -from vllm.platforms import current_platform @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. - """ - if not current_platform.is_cpu(): - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.skipif(not is_quant_method_supported("modelopt"), diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index 5f78bc30504c0..088b68510cffa 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -13,6 +13,16 @@ from vllm.model_executor.layers.quantization.ptpc_fp8 import ( PTPCFp8LinearMethod) from vllm.platforms import current_platform +UNSUPPORTED_STR = ( + "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only " + "support output dtype of bfloat16. torch.float16 is specified.") + + +@pytest.fixture(scope="function", autouse=True) +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + @pytest.mark.skipif(not is_quant_method_supported("ptpc_fp8"), reason="PTPC FP8 is not supported on this GPU type.") @@ -21,14 +31,22 @@ from vllm.platforms import current_platform @pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"]) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"]) def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: - try: - with vllm_runner("facebook/opt-125m", - dtype=dtype, - quantization="ptpc_fp8", - kv_cache_dtype=kv_cache_dtype) as llm: + llm = vllm_runner("facebook/opt-125m", + dtype=dtype, + quantization="ptpc_fp8", + kv_cache_dtype=kv_cache_dtype) + except AssertionError as e: + if str(e) == UNSUPPORTED_STR: + # If the error message matches, the test passes + return + else: + # If the error message does not match, re-raise the exception + raise - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + with llm: + + def check_model(model): fc1 = model.model.decoder.layers[0].fc1 assert isinstance(fc1.quant_method, PTPCFp8LinearMethod) if kv_cache_dtype == "ptpc_fp8": @@ -40,17 +58,8 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: if current_platform.has_device_capability(94): # For GPUs with hardware support, we keep weights in fp8 assert fc1.weight.dtype == torch.float8_e4m3fnuz - else: - pytest.skip() - output = llm.generate_greedy("Hello my name is", max_tokens=20) - assert output - except AssertionError as e: - if str( - e - ) == "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. 
torch.float16 is specified.": # noqa: E501 - # If the error message matches, the test passes - pass - else: - # If the error message does not match, re-raise the exception - raise + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index c09931971e6fb..930f4acb328fd 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -7,10 +7,10 @@ Run `pytest tests/quantization/test_quark.py`. See also `tests/kernels/moe/test_mxfp4_moe.py`. """ -import importlib import importlib.metadata import os from dataclasses import dataclass +from importlib.util import find_spec import huggingface_hub import lm_eval @@ -24,9 +24,8 @@ from vllm.platforms import current_platform from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch -QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( - "quark") is not None and version.parse( - importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') +QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse( + importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') if QUARK_MXFP4_AVAILABLE: from quark.torch.export.nn.modules.realquantizer import ( @@ -43,11 +42,9 @@ except huggingface_hub.errors.RepositoryNotFoundError: @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.parametrize('kv_cache_dtype', ['auto', 'fp8']) @@ -132,13 +129,12 @@ def test_quark_fp8_parity(vllm_runner): } with (vllm_runner(quark_model_id, **llm_kwargs) as quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle): - quark_model = (quark_handle.llm.llm_engine.model_executor. - driver_worker.model_runner.model) - quark_state_dict = quark_model.state_dict() - fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker. - model_runner.model) - fp8_state_dict = fp8_model.state_dict() + def get_state_dict(model): + return {k: v.cpu() for k, v in model.state_dict().items()} + + quark_state_dict, = quark_handle.apply_model(get_state_dict) + fp8_state_dict, = fp8_handle.apply_model(get_state_dict) assert fp8_state_dict.keys() == quark_state_dict.keys() diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 84705e92c85bb..03fe59d7e3bff 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -105,18 +105,21 @@ def test_register_quantization_config(): ]) def test_custom_quant(vllm_runner, model, monkeypatch): """Test infer with the custom quantization method.""" - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. 
+ monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + with vllm_runner(model_name=model, quantization="custom_quant", enforce_eager=True) as llm: - model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + qkv_proj = layer.self_attn.qkv_proj - # Check the quantization method is FakeQuantLinearMethod - assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + # Check the quantization method is FakeQuantLinearMethod + assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 708f3bbeeff15..014bc56bc8ece 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -13,6 +13,7 @@ from typing import Sequence as GenericSequence from typing import Set, Type, Union, cast import torch +import torch.nn as nn from typing_extensions import TypeVar import vllm.envs as envs @@ -55,6 +56,7 @@ from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind from vllm.version import __version__ as VLLM_VERSION from vllm.worker.model_runner_base import InputProcessingError +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -1817,13 +1819,16 @@ class LLMEngine: return sampling_params def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, args: tuple = (), kwargs: Optional[dict[str, Any]] = None) -> list[_R]: return self.model_executor.collective_rpc(method, timeout, args, kwargs) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + return self.collective_rpc("apply_model", args=(func, )) + if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e21bfce0ab085..f2282c40f7073 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -522,9 +522,14 @@ class LLM: """ Run a function directly on the model inside each worker, returning the result for each of them. + + !!! warning + To reduce the overhead of data transfer, avoid returning large + arrays or tensors from this method. If you must return them, + make sure you move them to CPU first to avoid taking up additional + VRAM! 
""" - executor = self.llm_engine.model_executor - return executor.apply_model(func) + return self.llm_engine.apply_model(func) def _get_beam_search_lora_requests( self, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 42aa8d14a21eb..b75b94ad0acc2 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -5,11 +5,10 @@ import asyncio import time from abc import ABC, abstractmethod from functools import cached_property -from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, - Union) +from typing import Any, Awaitable, Callable, List, Optional, Set, Union import torch.nn as nn -from typing_extensions import TypeVar +from typing_extensions import TypeVar, deprecated import vllm.platforms from vllm.config import VllmConfig @@ -63,10 +62,10 @@ class ExecutorBase(ABC): @abstractmethod def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: """ Execute an RPC call on all workers. @@ -91,7 +90,7 @@ class ExecutorBase(ABC): """ raise NotImplementedError - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and swappable CPU KV cache. @@ -99,9 +98,10 @@ class ExecutorBase(ABC): ExecutorBase may require modification of the result, e.g. to ensure the selected cache sizes are compatible with all workers. - Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks - are blocks that are "active" on the device and can be appended to. - num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + Returns a tuple `(num_gpu_blocks, num_cpu_blocks)`, where + `num_gpu_blocks` are blocks that are "active" on the device and can be + appended to. + `num_cpu_blocks` refers to "swapped" blocks in CPU memory and cannot be appended to. """ results = self.collective_rpc("determine_num_available_blocks") @@ -127,16 +127,15 @@ class ExecutorBase(ABC): self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) + @deprecated("`llm_engine.model_executor.apply_model` will no longer work " + "in V1 Engine. Please replace with `llm_engine.apply_model` " + "and set `VLLM_ALLOW_INSECURE_SERIALIZATION=1`.") def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: """ Run a function directly on the model inside each worker, returning the result for each of them. 
""" - - def rpc_func(worker: WorkerBase) -> _R: - return func(worker.get_model()) - - return self.collective_rpc(rpc_func) + return self.collective_rpc("apply_model", args=(func, )) @cached_property # Avoid unnecessary RPC calls def supported_tasks(self) -> tuple[SupportedTask, ...]: @@ -308,8 +307,8 @@ class DistributedExecutorBase(ExecutorBase): def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[Any]: return self._run_workers(method, *args, **(kwargs or {})) @abstractmethod diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c93bfc35f0aeb..907656d1b24cb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -5,6 +5,7 @@ from collections.abc import Mapping from copy import copy from typing import Any, Callable, Optional, Union +import torch.nn as nn from typing_extensions import TypeVar import vllm.envs as envs @@ -33,6 +34,7 @@ from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase, StatLoggerFactory) from vllm.v1.metrics.reader import Metric, get_metrics_snapshot from vllm.v1.metrics.stats import IterationStats +from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -319,12 +321,15 @@ class LLMEngine: return self.engine_core.pin_lora(lora_id) def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, args: tuple = (), kwargs: Optional[dict[str, Any]] = None) -> list[_R]: return self.engine_core.collective_rpc(method, timeout, args, kwargs) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + return self.collective_rpc("apply_model", args=(func, )) + def __del__(self): if dp_group := getattr(self, "dp_group", None): stateless_destroy_torch_distributed_process_group(dp_group) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index aa76d21f0fcaa..d0a56f6ff4637 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -5,7 +5,8 @@ import dataclasses import os import time from abc import abstractmethod -from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union +from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, Type, + TypeVar, Union) import cloudpickle import torch @@ -28,6 +29,8 @@ from vllm.worker.model_runner_base import (BroadcastableModelInput, logger = init_logger(__name__) +_R = TypeVar("_R") + @warn_for_unimplemented_methods class WorkerBase: @@ -70,6 +73,10 @@ class WorkerBase: def get_model(self) -> nn.Module: raise NotImplementedError + def apply_model(self, fn: Callable[[nn.Module], _R]) -> _R: + """Apply a function on the model inside this worker.""" + return fn(self.get_model()) + def load_model(self) -> None: """Load model onto target device.""" raise NotImplementedError From e08a3a3fdbdb5408f904a237b31ff2447a336b2f Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 20 Sep 2025 04:16:56 -0400 Subject: [PATCH 24/24] [CI Failure] Disable FlashInfer RoPE to unblock CI (#25299) Signed-off-by: mgoin --- .../model_executor/layers/rotary_embedding/base.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 3dc249ae9adb9..1c3576bee5392 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py 
+++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -6,8 +6,6 @@ from typing import Optional import torch from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer from .common import apply_rotary_emb_torch @@ -32,13 +30,15 @@ class RotaryEmbedding(CustomOp): self.base = base self.is_neox_style = is_neox_style self.dtype = dtype + # TODO(mgoin): disabled for now due to failures # Flashinfer only supports head_size=64, 128, 256, 512. # https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202 - self.use_flashinfer = (self.enabled() - and dtype in (torch.float16, torch.bfloat16) - and current_platform.is_cuda() - and has_flashinfer() - and self.head_size in [64, 128, 256, 512]) + # self.use_flashinfer = (self.enabled() + # and dtype in (torch.float16, torch.bfloat16) + # and current_platform.is_cuda() + # and has_flashinfer() + # and self.head_size in [64, 128, 256, 512]) + self.use_flashinfer = False cache = self._compute_cos_sin_cache() if not self.use_flashinfer:
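
A minimal usage sketch for the `LLM.apply_model` entry point wired up in the
patches above (illustrative only, not part of any diff in this series; the
model name and the parameter-counting callback are arbitrary examples, and
`VLLM_ALLOW_INSECURE_SERIALIZATION=1` is assumed, as in the updated test
fixtures, so the callback can be pickled under V1 multiprocessing):

    # Illustrative sketch, not part of the patch series above.
    import os

    # Assumed prerequisite (mirrors the test fixtures changed in this series):
    # allow the callback to be pickled when V1 multiprocessing is enabled.
    os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"

    import torch.nn as nn
    from vllm import LLM

    llm = LLM(model="facebook/opt-125m")  # model choice is illustrative

    def count_parameters(model: nn.Module) -> int:
        # Return a small Python object; avoid returning large tensors, or
        # move them to CPU first, per the warning added to LLM.apply_model.
        return sum(p.numel() for p in model.parameters())

    # One result per worker (a single element when tensor_parallel_size == 1).
    print(llm.apply_model(count_parameters))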