Merge branch 'main' into Add_support_for_openpangu_promoe_v2

Signed-off-by: yt0428 <51468697+yt0428@users.noreply.github.com>
This commit is contained in:
yt0428 2025-11-26 11:41:50 +08:00 committed by GitHub
commit 28169a6fce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
77 changed files with 1685 additions and 441 deletions

View File

@ -903,11 +903,12 @@ steps:
- label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/"
optional: true
soft_fail: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl

View File

@ -13,7 +13,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0

View File

@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
- uses: astral-sh/setup-uv@v7
with:

View File

@ -16,7 +16,7 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: "3.12"

View File

@ -604,12 +604,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")

View File

@ -22,6 +22,7 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include "cutlass_extensions/common.hpp"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
@ -173,7 +174,7 @@ void run_get_group_gemm_starts(
}
template <typename OutType>
void run_fp4_blockwise_scaled_group_mm(
void run_fp4_blockwise_scaled_group_mm_sm100(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
@ -343,17 +344,225 @@ void run_fp4_blockwise_scaled_group_mm(
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM");
"Failed to implement GEMM: status=", (int)can_implement_status);
// Run the GEMM
auto status = gemm_op.initialize(args, workspace.data_ptr());
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
TORCH_CHECK(status == cutlass::Status::kSuccess,
"Failed to initialize GEMM: status=", (int)status,
" workspace_size=", workspace_size, " num_experts=", num_experts,
" M=", M, " N=", N, " K=", K);
status = gemm_op.run(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
// Grouped (per-expert) NVFP4 block-scaled GEMM for SM120-class GPUs, built on
// CUTLASS GemmUniversal in kGrouped mode. Per-expert operand pointers, strides
// and scale-factor layouts are gathered on-device by run_get_group_gemm_starts,
// then one grouped kernel is launched for all experts.
//
// Unlike the SM100 path, the output element type here is fixed to bfloat16
// (see NOTE below) rather than templated on OutType.
//
//   output         - result tensor, written in place (bf16)
//   a, b           - packed FP4 (e2m1) operands, two values per byte
//   a_blockscale,
//   b_blockscales  - ue4m3 block scale factors
//   alphas         - per-expert float output scales (applied via epilogue)
//   problem_sizes  - per-expert (M, N, K) shapes, consumed directly as
//                    the grouped-GEMM problem-shape array
//   expert_offsets - per-expert offsets into a/output
//   sf_offsets     - per-expert offsets into the scale-factor tensors
//   M, N, K        - overall problem dimensions (also echoed in errors)
void run_fp4_blockwise_scaled_group_mm_sm120(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
    int N, int K) {
  using ProblemShape =
      cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
  using ElementType = cutlass::float_e2m1_t;
  using ElementSFType = cutlass::float_ue4m3_t;
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;

  // NOTE: For SM120 it seems templating the output type is not supported and
  // we need to hardcode the output type to bfloat16
  using ElementC = cutlass::bfloat16_t;
  using ElementD = ElementC;
  using ElementAccumulator = float;

  // Layout definitions
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;
  using LayoutD = LayoutC;

  // Alignment constraints (in elements; FP4 operands need 32-element
  // alignment, C/D derive alignment from a 128-bit access)
  static constexpr int AlignmentA = 32;
  static constexpr int AlignmentB = 32;
  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Architecture definitions
  using ArchTag = cutlass::arch::Sm120;
  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
  using ClusterShape = Shape<_1, _1, _1>;
  using MmaTileShape = Shape<_128, _128, _128>;

  // Epilogue computes D = alpha * acc (per-expert alpha pointers, beta = 0;
  // set up via fusion_args below).
  using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
      ElementD, ElementAccumulator, ElementC, ElementAccumulator>;
  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag, OperatorClass, MmaTileShape, ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
          ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
          LayoutD*, AlignmentD,
          cutlass::epilogue::collective::EpilogueScheduleAuto,
          FusionOperation>::CollectiveOp;
  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
          LayoutB*, AlignmentB, ElementAccumulator, MmaTileShape, ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
  using GemmKernel =
      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
                                           CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
  using LayoutSFA =
      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
  using LayoutSFB =
      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
  using ScaleConfig =
      typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;

  int num_experts = static_cast<int>(expert_offsets.size(0));

  // Per-expert device arrays of pointers/strides, stored as int64 and
  // reinterpreted to the CUTLASS types below.
  auto options_int =
      torch::TensorOptions().dtype(torch::kInt64).device(a.device());
  torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
  // NOTE(review): 5 int64 slots per expert must match sizeof(LayoutSFA) /
  // sizeof(LayoutSFB) — confirm against the SM100 path's layout tensors.
  torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
  torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
  torch::Tensor c_strides1 =
      torch::full({num_experts}, output.stride(0), options_int);
  // The "* 2" converts byte strides of the packed tensors into FP4-element
  // strides (two e2m1 values per byte) — presumably; TODO confirm against
  // run_get_group_gemm_starts.
  torch::Tensor a_strides1 =
      torch::full({num_experts}, a.stride(0) * 2, options_int);
  torch::Tensor b_strides1 =
      torch::full({num_experts}, b.stride(1) * 2, options_int);
  // Fill the per-expert pointer/layout arrays on-device.
  run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
      a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
      layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
      expert_offsets, sf_offsets, problem_sizes, M, N, K);

  // Create an instance of the GEMM
  Gemm gemm_op;

  // Initialize problem_sizes_as_shapes correctly
  UnderlyingProblemShape* problem_sizes_as_shapes =
      static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());

  // Set the Scheduler info
  cutlass::KernelHardwareInfo hw_info;
  using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
  typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
  scheduler.raster_order = RasterOrderOptions::AlongM;
  hw_info.device_id = a.get_device();
  // Cache the SM count per device to avoid repeated driver queries.
  // NOTE(review): this static map is not synchronized — confirm callers are
  // single-threaded (or serialized per device) before relying on it.
  static std::unordered_map<int, int> cached_sm_counts;
  if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) {
    cached_sm_counts[hw_info.device_id] =
        cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
            hw_info.device_id);
  }
  // NOTE(review): min(x, INT_MAX) is a no-op for an int — likely a leftover
  // from a configurable SM-count cap; consider removing or parameterizing.
  hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX);

  // Mainloop Arguments
  typename GemmKernel::MainloopArguments mainloop_args{
      static_cast<const ElementType**>(a_ptrs.data_ptr()),
      static_cast<StrideA*>(a_strides1.data_ptr()),
      static_cast<const ElementType**>(b_ptrs.data_ptr()),
      static_cast<StrideB*>(b_strides1.data_ptr()),
      static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
      reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
      static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
      reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};

  // Epilogue Arguments (C operand unused: null source, beta = 0)
  typename GemmKernel::EpilogueArguments epilogue_args{
      {},  // epilogue.thread
      nullptr,
      static_cast<StrideC*>(c_strides1.data_ptr()),
      static_cast<ElementD**>(out_ptrs.data_ptr()),
      static_cast<StrideC*>(c_strides1.data_ptr())};
  auto& fusion_args = epilogue_args.thread;
  fusion_args.alpha_ptr_array =
      reinterpret_cast<float**>(alpha_ptrs.data_ptr());
  // One alpha per group (expert): batch stride 1, no M/N variation.
  fusion_args.dAlpha = {_0{}, _0{}, 1};
  fusion_args.beta = 0.0f;

  // Gemm Arguments
  typename GemmKernel::Arguments args{
      cutlass::gemm::GemmUniversalMode::kGrouped,
      {num_experts, problem_sizes_as_shapes, nullptr},
      mainloop_args,
      epilogue_args,
      hw_info,
      scheduler};

  size_t workspace_size = Gemm::get_workspace_size(args);
  auto const workspace_options =
      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
  auto workspace = torch::empty(workspace_size, workspace_options);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());

  auto can_implement_status = gemm_op.can_implement(args);
  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
              "Failed to implement GEMM: status=", (int)can_implement_status);

  // Run the GEMM
  auto status = gemm_op.initialize(args, workspace.data_ptr());
  TORCH_CHECK(status == cutlass::Status::kSuccess,
              "Failed to initialize GEMM: status=", (int)status,
              " workspace_size=", workspace_size, " num_experts=", num_experts,
              " M=", M, " N=", N, " K=", K);

  status = gemm_op.run(args, workspace.data_ptr(), stream);
  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
// Dispatch the grouped NVFP4 block-scaled GEMM to whichever architecture
// kernel was compiled in: the SM120 path for capability [120, 130) or the
// SM100 path for capability [100, 120). Raises NotImplementedError (via
// TORCH_CHECK_NOT_IMPLEMENTED) when no compiled kernel matches the device.
//
// Note: the SM120 kernel hardcodes bf16 output, so OutType is deliberately
// not forwarded on that path.
template <typename OutType>
void run_fp4_blockwise_scaled_group_mm(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
    int N, int K) {
  const int32_t sm = get_sm_version_num();
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
  if (sm >= 120 && sm < 130) {
    run_fp4_blockwise_scaled_group_mm_sm120(
        output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
        expert_offsets, sf_offsets, M, N, K);
    return;
  }
#endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
  if (sm >= 100 && sm < 120) {
    run_fp4_blockwise_scaled_group_mm_sm100<OutType>(
        output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
        expert_offsets, sf_offsets, M, N, K);
    return;
  }
#endif
  TORCH_CHECK_NOT_IMPLEMENTED(
      false,
      "No compiled cutlass_fp4_group_mm kernel for CUDA device capability: ",
      sm, ". Required capability: 100 or 120");
}
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
#endif
@ -374,7 +583,8 @@ void cutlass_fp4_group_mm(
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
// Input validation
CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
@ -408,6 +618,14 @@ void cutlass_fp4_group_mm(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
} else {
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
int32_t version_num = get_sm_version_num();
if (version_num >= 120 && version_num < 130) {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "SM120 NVFP4 MOE only supports bfloat16 output, got: ",
output.scalar_type());
}
#endif
run_fp4_blockwise_scaled_group_mm<cutlass::half_t>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
@ -416,8 +634,8 @@ void cutlass_fp4_group_mm(
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled cutlass_fp4_group_mm kernel, vLLM must "
"be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA "
"12.8 or above.");
"be compiled with ENABLE_NVFP4_SM100 or ENABLE_NVFP4_SM120 for SM100/120 "
"and CUDA 12.8 or above.");
#endif
}

View File

@ -307,7 +307,7 @@ constexpr auto FLOAT = at::ScalarType::Float;
constexpr auto INT = at::ScalarType::Int;
constexpr auto UINT8 = at::ScalarType::Byte;
void scaled_fp4_experts_quant_sm100a(
void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts,

View File

@ -24,8 +24,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
torch::Tensor const& input_sf);
#endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
void scaled_fp4_experts_quant_sm100a(
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts,
@ -54,8 +55,9 @@ void scaled_fp4_experts_quant(
torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts,
torch::Tensor const& output_scale_offset_by_experts) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
return scaled_fp4_experts_quant_sm100a(
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
return scaled_fp4_experts_quant_sm1xxa(
output, output_scale, input, input_global_scale, input_offset_by_experts,
output_scale_offset_by_experts);
#endif

View File

@ -67,9 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
std::optional<torch::Tensor> const& bias);
#endif
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
#if (defined(ENABLE_CUTLASS_MOE_SM90) && ENABLE_CUTLASS_MOE_SM90) || \
(defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100) || \
(defined(ENABLE_CUTLASS_MOE_SM120) && ENABLE_CUTLASS_MOE_SM120)
void get_cutlass_moe_mm_data_caller(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@ -284,8 +284,9 @@ void get_cutlass_moe_mm_data(
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
problem_sizes2, input_permutation,
output_permutation, num_experts, n, k,
@ -296,7 +297,7 @@ void get_cutlass_moe_mm_data(
false,
"No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
"CUDA device capability: ",
version_num, ". Required capability: 90 or 100");
version_num, ". Required capability: 90, 100, or 120");
}
void get_cutlass_moe_mm_problem_sizes(
@ -304,8 +305,9 @@ void get_cutlass_moe_mm_problem_sizes(
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
problem_sizes2, num_experts, n, k,
blockscale_offsets);
@ -315,7 +317,7 @@ void get_cutlass_moe_mm_problem_sizes(
false,
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
"kernel for CUDA device capability: ",
version_num, ". Required capability: 90 or 100");
version_num, ". Required capability: 90, 100, or 120");
}
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
@ -328,8 +330,9 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k);
@ -339,7 +342,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ",
version_num, ". Required capability: 90 or 100");
version_num, ". Required capability: 90, 100, or 120");
}
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,

View File

@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod]
- [`CompressedTensorsW4A4Nvfp4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4Nvfp4MoeMethod]
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]

View File

@ -133,7 +133,7 @@ def main(args):
tensor_parallel_size=args.tp,
enable_chunked_prefill=args.enable_chunked_prefill,
enforce_eager=args.enforce_eager,
gpu_memory_utilization=0.8,
gpu_memory_utilization=0.9,
speculative_config=speculative_config,
disable_log_stats=False,
max_model_len=args.max_model_len,

View File

@ -55,7 +55,7 @@ class SillyModel(nn.Module):
def _run_simple_model(
splitting_ops,
use_inductor_graph_partition,
use_inductor,
backend,
expected_num_piecewise_graphs_seen,
expected_num_piecewise_capturable_graphs_seen,
expected_num_backend_compilations,
@ -64,7 +64,7 @@ def _run_simple_model(
vllm_config = VllmConfig(
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
use_inductor=use_inductor,
backend=backend,
splitting_ops=splitting_ops,
use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_copy_inputs=True,
@ -124,14 +124,14 @@ def _run_simple_model(
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
@pytest.mark.parametrize("use_inductor", [True, False])
@pytest.mark.parametrize("backend", ["inductor", "eager"])
@torch.inference_mode()
@create_new_process_for_each_test("spawn")
def test_simple_piecewise_compile(use_inductor):
def test_simple_piecewise_compile(backend):
_run_simple_model(
splitting_ops=["silly::attention"],
use_inductor_graph_partition=False,
use_inductor=use_inductor,
backend=backend,
# 2 * num_layers + 1
expected_num_piecewise_graphs_seen=5,
# 1 + num_layers
@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch):
_run_simple_model(
splitting_ops=["silly::attention"],
use_inductor_graph_partition=True,
use_inductor=True,
backend="inductor",
# Since not splitting at fx graph level
expected_num_piecewise_graphs_seen=1,
# Since not splitting at fx graph level

View File

@ -249,14 +249,13 @@ def test_compilation_config():
args = parser.parse_args(
[
"-O",
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": false}',
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
]
)
assert (
args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and not args.compilation_config.use_inductor
and args.compilation_config.backend == "eager"
)
# set to string form of a dict
@ -264,13 +263,13 @@ def test_compilation_config():
[
"--compilation-config="
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": true}',
'"backend": "inductor"}',
]
)
assert (
args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and args.compilation_config.use_inductor
and args.compilation_config.backend == "inductor"
)
@ -278,8 +277,9 @@ def test_prefix_cache_default():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([])
# should be None by default (depends on model).
engine_args = EngineArgs.from_cli_args(args=args)
assert engine_args.enable_prefix_caching, "prefix caching should default to on."
assert engine_args.enable_prefix_caching is None
# with flag to turn it on.
args = parser.parse_args(["--enable-prefix-caching"])

View File

@ -2,6 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
from openai.types.responses.response_reasoning_item import (
Content,
ResponseReasoningItem,
@ -76,6 +79,18 @@ class TestResponsesUtils:
== 'Hmm, the user has just started with a simple "Hello,"'
)
tool_call_output = ResponseFunctionToolCallOutputItem(
id="temp_id",
type="function_call_output",
call_id="temp",
output="1234",
status="completed",
)
formatted_item = construct_chat_message_with_tool_call(tool_call_output)
assert formatted_item["role"] == "tool"
assert formatted_item["content"] == "1234"
assert formatted_item["tool_call_id"] == "temp"
item = ResponseReasoningItem(
id="lol",
summary=[],

View File

@ -0,0 +1,240 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from vllm.config import ModelConfig
from vllm.config.load import LoadConfig
from vllm.model_executor.model_loader.gguf_loader import GGUFModelLoader
from vllm.model_executor.model_loader.weight_utils import download_gguf
class TestGGUFDownload:
    """Test GGUF model downloading functionality.

    All tests mock ``download_weights_from_hf`` (so no network access happens)
    and ``glob.glob`` (so no files need to exist on disk), then exercise
    ``download_gguf``'s pattern construction and file-selection logic.
    """

    @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
    def test_download_gguf_single_file(self, mock_download):
        """Test downloading a single GGUF file."""
        # Setup mock: pretend the HF download landed in this cache folder.
        mock_folder = "/tmp/mock_cache"
        mock_download.return_value = mock_folder
        # Mock glob to return a single file only for quant-specific patterns.
        with patch("glob.glob") as mock_glob:
            mock_glob.side_effect = lambda pattern, **kwargs: (
                [f"{mock_folder}/model-IQ1_S.gguf"] if "IQ1_S" in pattern else []
            )
            result = download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S")
            # Verify download_weights_from_hf was called with correct patterns
            # (quant type matched at top level and one subdirectory deep).
            mock_download.assert_called_once_with(
                model_name_or_path="unsloth/Qwen3-0.6B-GGUF",
                cache_dir=None,
                allow_patterns=[
                    "*-IQ1_S.gguf",
                    "*-IQ1_S-*.gguf",
                    "*/*-IQ1_S.gguf",
                    "*/*-IQ1_S-*.gguf",
                ],
                revision=None,
                ignore_patterns=None,
            )
            # Verify result is the file path, not folder
            assert result == f"{mock_folder}/model-IQ1_S.gguf"

    @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
    def test_download_gguf_sharded_files(self, mock_download):
        """Test downloading sharded GGUF files."""
        mock_folder = "/tmp/mock_cache"
        mock_download.return_value = mock_folder
        # Mock glob to return sharded files (NNNNN-of-NNNNN naming).
        with patch("glob.glob") as mock_glob:
            mock_glob.side_effect = lambda pattern, **kwargs: (
                [
                    f"{mock_folder}/model-Q2_K-00001-of-00002.gguf",
                    f"{mock_folder}/model-Q2_K-00002-of-00002.gguf",
                ]
                if "Q2_K" in pattern
                else []
            )
            result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K")
            # Should return the first file after sorting (i.e. shard 00001).
            assert result == f"{mock_folder}/model-Q2_K-00001-of-00002.gguf"

    @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
    def test_download_gguf_subdir(self, mock_download):
        """Test downloading GGUF files from subdirectory."""
        mock_folder = "/tmp/mock_cache"
        mock_download.return_value = mock_folder
        with patch("glob.glob") as mock_glob:
            # Only match when the pattern targets the quant type or the
            # recursive fallback pattern.
            mock_glob.side_effect = lambda pattern, **kwargs: (
                [f"{mock_folder}/Q2_K/model-Q2_K.gguf"]
                if "Q2_K" in pattern or "**/*.gguf" in pattern
                else []
            )
            result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K")
            assert result == f"{mock_folder}/Q2_K/model-Q2_K.gguf"

    @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
    @patch("glob.glob", return_value=[])
    def test_download_gguf_no_files_found(self, mock_glob, mock_download):
        """Test error when no GGUF files are found."""
        mock_folder = "/tmp/mock_cache"
        mock_download.return_value = mock_folder
        # glob returns nothing for every pattern, so download_gguf must raise.
        with pytest.raises(ValueError, match="Downloaded GGUF files not found"):
            download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S")
class TestGGUFModelLoader:
    """Test GGUFModelLoader class methods.

    Focuses on ``_prepare_weights`` resolution of the different model
    reference forms: local file path, HTTPS URL, ``repo_id/filename.gguf``,
    ``repo_id:quant_type``, and unrecognised inputs.
    """

    @patch("os.path.isfile", return_value=True)
    def test_prepare_weights_local_file(self, mock_isfile):
        """Test _prepare_weights with local file."""
        load_config = LoadConfig(load_format="gguf")
        loader = GGUFModelLoader(load_config)
        # Create a simple mock ModelConfig with only the model attribute
        model_config = MagicMock()
        model_config.model = "/path/to/model.gguf"
        result = loader._prepare_weights(model_config)
        # A local file is returned as-is, no download attempted.
        assert result == "/path/to/model.gguf"
        mock_isfile.assert_called_once_with("/path/to/model.gguf")

    @patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download")
    @patch("os.path.isfile", return_value=False)
    def test_prepare_weights_https_url(self, mock_isfile, mock_hf_download):
        """Test _prepare_weights with HTTPS URL."""
        load_config = LoadConfig(load_format="gguf")
        loader = GGUFModelLoader(load_config)
        mock_hf_download.return_value = "/downloaded/model.gguf"
        # Create a simple mock ModelConfig with only the model attribute
        model_config = MagicMock()
        model_config.model = "https://huggingface.co/model.gguf"
        result = loader._prepare_weights(model_config)
        assert result == "/downloaded/model.gguf"
        # URL form is fetched via hf_hub_download(url=...).
        mock_hf_download.assert_called_once_with(
            url="https://huggingface.co/model.gguf"
        )

    @patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download")
    @patch("os.path.isfile", return_value=False)
    def test_prepare_weights_repo_filename(self, mock_isfile, mock_hf_download):
        """Test _prepare_weights with repo_id/filename.gguf format."""
        load_config = LoadConfig(load_format="gguf")
        loader = GGUFModelLoader(load_config)
        mock_hf_download.return_value = "/downloaded/model.gguf"
        # Create a simple mock ModelConfig with only the model attribute
        model_config = MagicMock()
        model_config.model = "unsloth/Qwen3-0.6B-GGUF/model.gguf"
        result = loader._prepare_weights(model_config)
        assert result == "/downloaded/model.gguf"
        # repo/filename form is split into repo_id + filename for the hub call.
        mock_hf_download.assert_called_once_with(
            repo_id="unsloth/Qwen3-0.6B-GGUF", filename="model.gguf"
        )

    # Patches below stub out everything ModelConfig construction touches
    # (HF config fetch, gguf detection, file existence) so the test never
    # hits the network. Note: mock arguments arrive bottom-up relative to
    # the decorator stack.
    @patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
    @patch("vllm.transformers_utils.config.file_or_path_exists", return_value=True)
    @patch("vllm.config.model.get_config")
    @patch("vllm.config.model.is_gguf", return_value=True)
    @patch("vllm.model_executor.model_loader.gguf_loader.download_gguf")
    @patch("os.path.isfile", return_value=False)
    def test_prepare_weights_repo_quant_type(
        self,
        mock_isfile,
        mock_download_gguf,
        mock_is_gguf,
        mock_get_config,
        mock_file_exists,
        mock_get_image_config,
    ):
        """Test _prepare_weights with repo_id:quant_type format."""
        # Minimal fake HF config sufficient for ModelConfig validation.
        mock_hf_config = MagicMock()
        mock_hf_config.architectures = ["Qwen3ForCausalLM"]

        class MockTextConfig:
            max_position_embeddings = 4096
            sliding_window = None
            model_type = "qwen3"
            num_attention_heads = 32

        mock_text_config = MockTextConfig()
        mock_hf_config.get_text_config.return_value = mock_text_config
        mock_hf_config.dtype = "bfloat16"
        mock_get_config.return_value = mock_hf_config
        load_config = LoadConfig(load_format="gguf")
        loader = GGUFModelLoader(load_config)
        mock_download_gguf.return_value = "/downloaded/model-IQ1_S.gguf"
        model_config = ModelConfig(
            model="unsloth/Qwen3-0.6B-GGUF:IQ1_S", tokenizer="Qwen/Qwen3-0.6B"
        )
        result = loader._prepare_weights(model_config)
        # The actual result will be the downloaded file path from mock
        assert result == "/downloaded/model-IQ1_S.gguf"
        # repo:quant form is resolved through download_gguf.
        mock_download_gguf.assert_called_once_with(
            "unsloth/Qwen3-0.6B-GGUF",
            "IQ1_S",
            cache_dir=None,
            revision=None,
            ignore_patterns=["original/**/*"],
        )

    @patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
    @patch("vllm.config.model.get_config")
    @patch("vllm.config.model.is_gguf", return_value=False)
    @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
    @patch("os.path.isfile", return_value=False)
    def test_prepare_weights_invalid_format(
        self,
        mock_isfile,
        mock_check_gguf,
        mock_is_gguf,
        mock_get_config,
        mock_get_image_config,
    ):
        """Test _prepare_weights with invalid format."""
        # Minimal fake HF config sufficient for ModelConfig validation.
        mock_hf_config = MagicMock()
        mock_hf_config.architectures = ["Qwen3ForCausalLM"]

        class MockTextConfig:
            max_position_embeddings = 4096
            sliding_window = None
            model_type = "qwen3"
            num_attention_heads = 32

        mock_text_config = MockTextConfig()
        mock_hf_config.get_text_config.return_value = mock_text_config
        mock_hf_config.dtype = "bfloat16"
        mock_get_config.return_value = mock_hf_config
        load_config = LoadConfig(load_format="gguf")
        loader = GGUFModelLoader(load_config)
        # Create ModelConfig with a valid repo_id to avoid validation errors
        # Then test _prepare_weights with invalid format
        model_config = ModelConfig(model="unsloth/Qwen3-0.6B")
        # Manually set model to invalid format after creation
        model_config.model = "invalid-format"
        with pytest.raises(ValueError, match="Unrecognised GGUF reference"):
            loader._prepare_weights(model_config)

View File

@ -1,11 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from unittest.mock import patch
import pytest
from vllm.transformers_utils.utils import (
is_cloud_storage,
is_gcs,
is_gguf,
is_remote_gguf,
is_s3,
split_remote_gguf,
)
@ -28,3 +34,143 @@ def test_is_cloud_storage():
assert is_cloud_storage("s3://model-path/path-to-model")
assert not is_cloud_storage("/unix/local/path")
assert not is_cloud_storage("nfs://nfs-fqdn.local")
class TestIsRemoteGGUF:
    """Tests for the is_remote_gguf utility function."""

    def test_is_remote_gguf_with_colon_and_slash(self):
        """``repo_id:quant_type`` strings count only for known quant types."""
        # Recognised quantization suffixes -> remote GGUF reference.
        for ref in (
            "unsloth/Qwen3-0.6B-GGUF:IQ1_S",
            "user/repo:Q2_K",
            "repo/model:Q4_K",
            "repo/model:Q8_0",
        ):
            assert is_remote_gguf(ref)
        # Unknown quantization suffixes are rejected.
        for ref in (
            "repo/model:quant",
            "repo/model:INVALID",
            "repo/model:invalid_type",
        ):
            assert not is_remote_gguf(ref)

    def test_is_remote_gguf_without_colon(self):
        """A bare repo id (no quant suffix) is not a remote GGUF reference."""
        for ref in ("repo/model", "unsloth/Qwen3-0.6B-GGUF"):
            assert not is_remote_gguf(ref)

    def test_is_remote_gguf_without_slash(self):
        """Without an org/name slash the string is never treated as remote."""
        for ref in ("model.gguf", "model:IQ1_S", "model:quant"):
            assert not is_remote_gguf(ref)

    def test_is_remote_gguf_local_path(self):
        """Local filesystem paths are not remote GGUF references."""
        for ref in ("/path/to/model.gguf", "./model.gguf"):
            assert not is_remote_gguf(ref)

    def test_is_remote_gguf_with_path_object(self):
        """pathlib.Path inputs behave the same as plain strings."""
        assert is_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S"))
        assert not is_remote_gguf(Path("repo/model"))

    def test_is_remote_gguf_with_http_https(self):
        """HTTP(S) URLs are excluded even with a valid quant suffix."""
        for url in (
            "http://example.com/repo/model:IQ1_S",
            "https://huggingface.co/repo/model:Q2_K",
            "http://repo/model:Q4_K",
            "https://repo/model:Q8_0",
        ):
            assert not is_remote_gguf(url)

    def test_is_remote_gguf_with_cloud_storage(self):
        """Cloud-storage URIs are excluded even with a valid quant suffix."""
        for uri in (
            "s3://bucket/repo/model:IQ1_S",
            "gs://bucket/repo/model:Q2_K",
            "s3://repo/model:Q4_K",
            "gs://repo/model:Q8_0",
        ):
            assert not is_remote_gguf(uri)
class TestSplitRemoteGGUF:
    """Tests for the split_remote_gguf utility function."""

    def test_split_remote_gguf_valid(self):
        """Valid ``repo_id:quant_type`` strings split into their two parts."""
        cases = (
            ("unsloth/Qwen3-0.6B-GGUF:IQ1_S", "unsloth/Qwen3-0.6B-GGUF", "IQ1_S"),
            ("repo/model:Q2_K", "repo/model", "Q2_K"),
        )
        for ref, expected_repo, expected_quant in cases:
            repo_id, quant_type = split_remote_gguf(ref)
            assert repo_id == expected_repo
            assert quant_type == expected_quant

    def test_split_remote_gguf_with_path_object(self):
        """pathlib.Path inputs split exactly like plain strings."""
        repo_id, quant_type = split_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S"))
        assert repo_id == "unsloth/Qwen3-0.6B-GGUF"
        assert quant_type == "IQ1_S"

    def test_split_remote_gguf_invalid(self):
        """Anything is_remote_gguf rejects raises ValueError when split."""
        invalid_refs = (
            "repo/model",  # no colon
            "repo/model:INVALID_TYPE",  # unknown quant type
            "http://repo/model:IQ1_S",  # HTTP URL
            "s3://bucket/repo/model:Q2_K",  # cloud storage
        )
        for ref in invalid_refs:
            with pytest.raises(ValueError, match="Wrong GGUF model"):
                split_remote_gguf(ref)
class TestIsGGUF:
    """Tests for the is_gguf utility function."""

    @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=True)
    def test_is_gguf_with_local_file(self, mock_check_gguf):
        """Local paths count as GGUF when check_gguf_file reports True."""
        for path in ("/path/to/model.gguf", "./model.gguf"):
            assert is_gguf(path)

    def test_is_gguf_with_remote_gguf(self):
        """Remote ``repo_id:quant_type`` refs count only for known quants."""
        for ref in (
            "unsloth/Qwen3-0.6B-GGUF:IQ1_S",
            "repo/model:Q2_K",
            "repo/model:Q4_K",
        ):
            assert is_gguf(ref)
        # Unknown quant suffixes must not be treated as GGUF.
        for ref in ("repo/model:quant", "repo/model:INVALID"):
            assert not is_gguf(ref)

    @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
    def test_is_gguf_false(self, mock_check_gguf):
        """Plain model ids are not GGUF when the local-file check fails."""
        for ref in ("unsloth/Qwen3-0.6B", "repo/model", "model"):
            assert not is_gguf(ref)

    def test_is_gguf_edge_cases(self):
        """Edge cases: empty, missing slash/colon, URLs, cloud storage."""
        for ref in (
            "",  # empty string
            "model:IQ1_S",  # colon but no slash (even with valid quant_type)
            "repo/model",  # slash but no colon
            "http://repo/model:IQ1_S",  # HTTP URL
            "https://repo/model:Q2_K",  # HTTPS URL
            "s3://bucket/repo/model:IQ1_S",  # cloud storage
            "gs://bucket/repo/model:Q2_K",  # cloud storage
        ):
            assert not is_gguf(ref)

View File

@ -166,7 +166,7 @@ def test_dict_args(parser):
"--hf-overrides.key2.key4",
"val3",
# Test compile config and compilation mode
"-O.use_inductor=true",
"-O.use_inductor_graph_partition=true",
"-O.backend",
"custom",
"-O1",
@ -219,7 +219,7 @@ def test_dict_args(parser):
}
assert parsed_args.compilation_config == {
"mode": 1,
"use_inductor": True,
"use_inductor_graph_partition": True,
"backend": "custom",
"custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
}

View File

@ -1248,7 +1248,9 @@ def test_allocate_with_lookahead():
)
# Test case 1: Requires additional lookahead tokens
kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100)
kv_cache_manager = KVCacheManager(
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
)
blocks = kv_cache_manager.allocate_slots(
request,
num_new_tokens=3,
@ -1257,7 +1259,9 @@ def test_allocate_with_lookahead():
assert len(blocks.get_block_ids()[0]) == 2 # ceil(5/4)=2 blocks
# Test case 2: With precomputed blocks
kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100)
kv_cache_manager = KVCacheManager(
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
)
# required_blocks = ceil((3 + 2) /4) = 2
blocks = kv_cache_manager.allocate_slots(
request,
@ -1268,7 +1272,9 @@ def test_allocate_with_lookahead():
# Test case 3: With precomputed blocks
# required_blocks = ceil((3 + 4) / 4) = 2
kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100)
kv_cache_manager = KVCacheManager(
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
)
blocks = kv_cache_manager.allocate_slots(
request,
num_new_tokens=3,
@ -1495,7 +1501,8 @@ def test_get_kv_cache_config_one_worker():
),
],
)
# different hidden size
# different hidden size but same type, use UniformTypeKVCacheSpecs
kv_cache_specs_hybrid = {
"layer_1": new_kv_cache_spec(head_size=128),
"layer_2": new_kv_cache_spec(head_size=64),
@ -1519,6 +1526,40 @@ def test_get_kv_cache_config_one_worker():
],
)
# Different hidden size and different type, align by different block size
kv_cache_specs_hybrid = {
"layer_1": new_kv_cache_spec(head_size=64),
"layer_2": new_sliding_window_spec(head_size=32),
}
kv_cache_config_hybrid = get_kv_cache_configs(
vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 32]
)[0]
assert kv_cache_config_hybrid == KVCacheConfig(
num_blocks=32,
kv_cache_tensors=[
KVCacheTensor(
size=mem_per_block_per_layer * 32, shared_by=["layer_1", "layer_2"]
),
],
kv_cache_groups=[
KVCacheGroupSpec(["layer_1"], new_kv_cache_spec(head_size=64)),
KVCacheGroupSpec(
["layer_2"], new_sliding_window_spec(head_size=32, block_size=32)
),
],
)
# different hidden size that cannot be aligned by using different block size
kv_cache_specs_hybrid = {
"layer_1": new_kv_cache_spec(head_size=64),
"layer_2": new_sliding_window_spec(head_size=96),
}
with pytest.raises(NotImplementedError):
get_kv_cache_configs(
vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 2 * 32]
)[0]
# Test num_gpu_blocks_override
vllm_config.cache_config.num_gpu_blocks_override = 16
kv_cache_config_override_blocks = get_kv_cache_configs(

View File

@ -134,6 +134,7 @@ def test_prefill(hash_fn):
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
# Complete 3 blocks (48 tokens)
@ -256,6 +257,7 @@ def test_prefill_hybrid_model():
make_kv_cache_config_hybrid_model(block_size, 21),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
hash_fn = sha256
@ -416,6 +418,7 @@ def test_prefill_plp():
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
# the default hash function is sha256
hash_fn = sha256
@ -523,6 +526,7 @@ def test_decode():
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
# Complete 3 blocks (48 tokens)
@ -585,6 +589,7 @@ def test_evict():
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
last_token_id = 5 * 16 + 7
@ -643,6 +648,7 @@ def test_hash_block_correct_reuse():
make_kv_cache_config(16, 2),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
# Allocate 1 block and cache it.
@ -683,6 +689,7 @@ def test_computed_blocks_not_evicted():
make_kv_cache_config(block_size, 3),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
# Allocate a block and cache it.
@ -741,6 +748,7 @@ def test_basic_prefix_caching_disabled():
make_kv_cache_config(block_size, 5),
max_model_len=8192,
enable_caching=False,
hash_block_size=block_size,
)
req1 = make_request(
@ -790,6 +798,7 @@ def test_cache_blocks(hash_fn):
block_pool = BlockPool(
num_gpu_blocks=5,
enable_caching=True,
hash_block_size=block_size,
)
# Req:
# Block 0: [0, 1, 2, 3]
@ -833,7 +842,9 @@ def test_cache_blocks_multi_group():
This tests that blocks are cached correctly for different kv cache groups.
"""
block_size = 4
block_pool = BlockPool(num_gpu_blocks=10, enable_caching=True)
block_pool = BlockPool(
num_gpu_blocks=10, enable_caching=True, hash_block_size=block_size
)
# Req:
# Block 0/4: [0, 1, 2, 3]
@ -921,6 +932,7 @@ def test_mm_prefix_caching():
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
# Common prompt tokens (T is text tokens and P is image placeholder tokens)
@ -1020,6 +1032,7 @@ def test_cache_key_salting():
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
# 3 complete blocks and an incomplete block with 11 tokens.
@ -1101,6 +1114,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
# Complete 3 blocks (48 tokens)
# | Common-0 | Common-1 | Common-2 | ... |
@ -1173,6 +1187,7 @@ def test_reset_prefix_cache():
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
full_block_token_ids = [i for i in range(3) for _ in range(16)]
@ -1213,6 +1228,7 @@ def test_prefix_cache_stats_disabled():
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
log_stats=False, # Disable logging stats
)
assert manager.prefix_cache_stats is None
@ -1232,7 +1248,7 @@ def test_prefix_cache_stats_disabled():
def test_maybe_evict_cached_block():
pool = BlockPool(num_gpu_blocks=4, enable_caching=True)
pool = BlockPool(num_gpu_blocks=4, enable_caching=True, hash_block_size=16)
block_hash0 = make_block_hash_with_group_id(BlockHash(b"10"), 1000)
block_hash1 = make_block_hash_with_group_id(BlockHash(b"20"), 2000)
block_hash2 = make_block_hash_with_group_id(BlockHash(b"30"), 3000)
@ -1293,6 +1309,7 @@ def test_kv_cache_events(blocks_to_cache: int):
max_model_len=8192,
enable_caching=True,
enable_kv_cache_events=True,
hash_block_size=block_size,
)
num_tokens = block_size * blocks_to_cache
@ -1351,6 +1368,7 @@ def test_kv_cache_events_with_lora(blocks_to_cache: int):
max_model_len=8192,
enable_caching=True,
enable_kv_cache_events=True,
hash_block_size=block_size,
)
# Test with LoRA request
@ -1405,6 +1423,7 @@ def test_eagle_enabled_removes_last_block():
max_model_len=8192,
enable_caching=True,
use_eagle=True,
hash_block_size=block_size,
)
# Request with 3 full blocks (48 tokens)
@ -1437,6 +1456,7 @@ def test_eagle_with_partial_blocks():
max_model_len=8192,
enable_caching=True,
use_eagle=True,
hash_block_size=block_size,
)
# 2 full blocks + 5 tokens (non-divisible length)
token_ids = [0] * (2 * block_size + 5)
@ -1476,6 +1496,7 @@ def test_eagle_with_sliding_window():
max_model_len=8192,
enable_caching=True,
use_eagle=True,
hash_block_size=block_size,
)
# 2 full blocks + 5 tokens (non-divisible length)
@ -1522,6 +1543,76 @@ def test_eagle_with_sliding_window():
assert num_tokens == 0
def test_different_block_size():
    # Verifies that KVCacheManager handles kv-cache groups whose block sizes
    # differ: a full-attention group with 32-token blocks (float16) and a
    # sliding-window group with 16-token blocks (float32), both hashed with
    # hash_block_size=16.
    block_size = 16
    # full attention and sliding window attention layers have the same page size:
    # (32 tokens/block * float16 token, vs. 16 tokens/block * float32 token)
    kv_cache_config = KVCacheConfig(
        num_blocks=100,
        kv_cache_tensors=[],
        kv_cache_groups=[
            KVCacheGroupSpec(
                ["layer1"],
                FullAttentionSpec(block_size * 2, 1, 1, torch.float16),
            ),
            KVCacheGroupSpec(
                ["layer2"],
                SlidingWindowSpec(
                    block_size,
                    1,
                    1,
                    torch.float32,
                    sliding_window=2 * block_size,
                ),
            ),
        ],
    )
    manager = KVCacheManager(
        kv_cache_config=kv_cache_config,
        max_model_len=8192,
        enable_caching=True,
        hash_block_size=block_size,
    )
    # 10 blocks of 16 tokens each. Token ids are not strictly aligned for each block.
    common_token_ids = [i for i in range(10) for _ in range(block_size)]
    req0 = make_request("0", common_token_ids, block_size, sha256)
    # Fresh request: nothing cached yet in either group.
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks.blocks[0]
    assert not computed_blocks.blocks[1]
    assert num_computed_tokens == 0
    # Allocate 7 * 16 tokens: group 0 (32-token blocks) needs 4 blocks,
    # group 1 (16-token blocks) needs 7 blocks.
    blocks = manager.allocate_slots(
        req0, 7 * block_size, len(computed_blocks.blocks[0]) * 16, computed_blocks
    )
    assert blocks.get_block_ids() == ([1, 2, 3, 4], [5, 6, 7, 8, 9, 10, 11])
    # req1 shares 7 full blocks plus one extra token with req0; the cache hit
    # is truncated to 6 * 16 tokens (last partial unit is excluded).
    req1 = make_request("1", common_token_ids[: 7 * block_size + 1], block_size, sha256)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(computed_blocks.blocks[0]) == 3
    assert len(computed_blocks.blocks[1]) == 6
    assert num_computed_tokens == 6 * 16
    # req2 shares 6 full blocks plus one token; same effective hit length.
    req2 = make_request("2", common_token_ids[: 6 * block_size + 1], block_size, sha256)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert len(computed_blocks.blocks[0]) == 3
    assert len(computed_blocks.blocks[1]) == 6
    assert num_computed_tokens == 6 * 16
    # Evict some blocks to make sliding window cache hit length 5*16
    # But should return 4 * 16 because full attention cache hit length must be
    # a multiple of 32
    # NOTE(review): the trailing integer (11 / 10) appears to be the pop()
    # default (the expected block id) -- confirm BlockHashToBlockMap.pop
    # signature.
    manager.block_pool.cached_block_hash_to_block.pop(
        make_block_hash_with_group_id(req1.block_hashes[6], 1), 11
    )
    manager.block_pool.cached_block_hash_to_block.pop(
        make_block_hash_with_group_id(req1.block_hashes[5], 1), 10
    )
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(computed_blocks.blocks[0]) == 2
    assert len(computed_blocks.blocks[1]) == 4
    assert num_computed_tokens == 4 * 16
def test_block_lookup_cache_single_block_per_key():
cache = BlockHashToBlockMap()
key0 = BlockHashWithGroupId(b"hash0")

View File

@ -41,7 +41,9 @@ def test_chunked_local_attention_possible_cached_prefix():
attention_chunk_size=4,
)
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
block_pool = BlockPool(
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
)
manager = get_chunked_local_attention_manager(
chunked_local_attention_spec, block_pool
)
@ -70,6 +72,7 @@ def test_chunked_local_attention_possible_cached_prefix():
block_pool=block_pool,
kv_cache_spec=chunked_local_attention_spec,
use_eagle=False,
alignment_tokens=block_size,
)[0]
assert len(computed_blocks) == expect_length
@ -111,7 +114,9 @@ def test_sliding_window_possible_cached_prefix():
sliding_window=4,
)
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
block_pool = BlockPool(
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
)
manager = get_sliding_window_manager(sliding_window_spec, block_pool)
def run_one_case(block_is_cached, expect_length):
@ -138,6 +143,7 @@ def test_sliding_window_possible_cached_prefix():
block_pool=block_pool,
kv_cache_spec=sliding_window_spec,
use_eagle=False,
alignment_tokens=block_size,
)[0]
assert len(computed_blocks) == expect_length
@ -178,7 +184,7 @@ def test_chunked_local_attention_remove_skipped_blocks():
attention_chunk_size=4,
)
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2)
manager = get_chunked_local_attention_manager(attention_spec, block_pool)
@ -239,7 +245,7 @@ def test_sliding_window_remove_skipped_blocks():
sliding_window=4,
)
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2)
manager = get_sliding_window_manager(sliding_window_spec, block_pool)
@ -316,7 +322,9 @@ def test_get_num_blocks_to_allocate():
sliding_window=4, # Placeholder value, not related to test result
)
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
block_pool = BlockPool(
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
)
manager = get_sliding_window_manager(sliding_window_spec, block_pool)
cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [
@ -341,7 +349,9 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
attention_chunk_size=4, # Placeholder value, not related to test result
)
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
block_pool = BlockPool(
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
)
manager = get_chunked_local_attention_manager(attention_spec, block_pool)
cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [

View File

@ -87,6 +87,11 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
# Set small draft model len to force doesn't-fit-in-drafter case.
spec_config_short = spec_config | {"max_model_len": 50}
test_sampling_params = [
dict(),
dict(logprobs=2),
]
# test_preemption, executor, async_scheduling,
# spec_config, test_prefill_chunking
test_configs = [
@ -103,7 +108,7 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
(True, "uni", True, spec_config_short, True),
]
run_tests(monkeypatch, MTP_MODEL, test_configs, [{}])
run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params)
@dynamo_config.patch(cache_size_limit=16)

View File

@ -11,6 +11,7 @@ import pprint
import time
from collections.abc import Callable, Sequence
from contextlib import contextmanager
from copy import deepcopy
from functools import partial
from typing import Any
@ -429,7 +430,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
self.vllm_backend.compiler_manager.compile(
submod,
args,
self.compilation_config.inductor_compile_config,
self.vllm_backend.inductor_config,
self.compilation_config,
graph_index=index,
num_graphs=len(self.compile_submod_names),
@ -531,6 +532,9 @@ class VllmBackend:
sym_tensor_indices: list[int]
input_buffers: list[torch.Tensor]
compiler_manager: CompilerManager
# Copy of CompilationConfig.inductor_compile_config +
# an entry for PostGradPassManager
inductor_config: dict[str, Any]
def __init__(
self,
@ -561,25 +565,30 @@ class VllmBackend:
self.compilation_config
)
# Deepcopy the inductor config to detach the post-grad custom pass
# from CompilationConfig.
# We want to avoid PostGradPassManager in CompilationConfig because
# in future we need PostGradPassManager.uuid() to be executed
# only at compile time.
self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config)
# `torch.compile` is JIT compiled, so we don't need to
# do anything here
def configure_post_pass(self):
config = self.compilation_config
self.pass_manager.configure(self.vllm_config)
# Post-grad custom passes are run using the post_grad_custom_post_pass
# hook. If a pass for that hook exists, add it to the pass manager.
inductor_config = config.inductor_compile_config
if self.pass_key in inductor_config:
if isinstance(inductor_config[self.pass_key], PostGradPassManager):
# PassManager already added to config, make sure it's correct
assert inductor_config[self.pass_key].uuid() == self.pass_manager.uuid()
if self.pass_key in self.inductor_config:
if isinstance(self.inductor_config[self.pass_key], PostGradPassManager):
raise ValueError(
"PostGradPassManager can not be kept in CompilationConfig."
)
else:
# Config should automatically wrap all inductor passes
assert isinstance(inductor_config[self.pass_key], InductorPass)
self.pass_manager.add(inductor_config[self.pass_key])
inductor_config[self.pass_key] = self.pass_manager
assert isinstance(self.inductor_config[self.pass_key], InductorPass)
self.pass_manager.add(self.inductor_config[self.pass_key])
self.inductor_config[self.pass_key] = self.pass_manager
def __call__(
self, graph: fx.GraphModule, example_inputs
@ -638,9 +647,7 @@ class VllmBackend:
self.compilation_config.local_cache_dir = local_cache_dir
# Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE.
disable_cache = not is_compile_cache_enabled(
self.compilation_config.inductor_compile_config
)
disable_cache = not is_compile_cache_enabled(self.inductor_config)
if disable_cache:
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import inspect
import os
import pickle
@ -14,6 +13,7 @@ import vllm.envs as envs
from vllm.config import VllmConfig, get_current_vllm_config
from vllm.config.utils import hash_factors
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
try:
from torch._dynamo.aot_compile import SerializableCallable
@ -160,7 +160,7 @@ def _compute_code_hash_with_content(file_contents: dict[str, str]) -> str:
# e.g. exec(). We can't actually check these.
continue
hash_content.append(content)
return hashlib.md5(
return safe_hash(
"\n".join(hash_content).encode(), usedforsecurity=False
).hexdigest()

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import copy
import hashlib
import os
from collections.abc import Callable
from contextlib import ExitStack
@ -16,6 +15,7 @@ import torch.fx as fx
import vllm.envs as envs
from vllm.compilation.counter import compilation_counter
from vllm.config import VllmConfig
from vllm.utils.hashing import safe_hash
from vllm.utils.torch_utils import is_torch_equal_or_newer
@ -197,9 +197,9 @@ class InductorStandaloneAdaptor(CompilerInterface):
def compute_hash(self, vllm_config: VllmConfig) -> str:
factors = get_inductor_factors()
hash_str = hashlib.md5(
str(factors).encode(), usedforsecurity=False
).hexdigest()[:10]
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[
:10
]
return hash_str
def initialize_cache(
@ -286,9 +286,9 @@ class InductorAdaptor(CompilerInterface):
def compute_hash(self, vllm_config: VllmConfig) -> str:
factors = get_inductor_factors()
hash_str = hashlib.md5(
str(factors).encode(), usedforsecurity=False
).hexdigest()[:10]
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[
:10
]
return hash_str
def initialize_cache(

View File

@ -107,7 +107,7 @@ class PiecewiseBackend:
entry.runnable = self.vllm_backend.compiler_manager.compile(
self.graph,
args,
self.compilation_config.inductor_compile_config,
self.vllm_backend.inductor_config,
self.compilation_config,
graph_index=self.piecewise_compile_index,
num_graphs=self.total_piecewise_compiles,

View File

@ -144,7 +144,7 @@ class CacheConfig:
kv_offloading_backend: KVOffloadingBackend | None = None
"""The backend to use for KV cache offloading. Supported backends include
'native' (vLLM native CPU offloading), 'lmcache' This option must be used
'native' (vLLM native CPU offloading), 'lmcache'. This option must be used
together with kv_offloading_size."""
def compute_hash(self) -> str:
@ -167,8 +167,6 @@ class CacheConfig:
"num_gpu_blocks_override",
"enable_prefix_caching",
"prefix_caching_hash_algo",
# `cpu_offload_gb` does not use `torch.compile` yet.
"cpu_offload_gb",
"cpu_kvcache_space_bytes",
"mamba_page_size_padded",
# Post-init/derived counters

View File

@ -264,7 +264,6 @@ class CompilationConfig:
- [`cudagraph_copy_inputs`]
[vllm.config.CompilationConfig.cudagraph_copy_inputs]
- Inductor compilation:
- [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
- [`inductor_compile_config`]
[vllm.config.CompilationConfig.inductor_compile_config]
@ -348,7 +347,7 @@ class CompilationConfig:
- 'none,+op1,+op2' to enable only op1 and op2
By default, all custom ops are enabled when running without Inductor and
disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True.
disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor".
Inductor generates (fused) Triton kernels for disabled custom ops."""
splitting_ops: list[str] | None = None
"""A list of ops to exclude from cudagraphs, used in piecewise compilation.
@ -374,24 +373,6 @@ class CompilationConfig:
Disabled by default until more models are supported/tested to work."""
# Inductor capture
use_inductor: bool | None = None
"""
Whether to use inductor compilation.
This flag is deprecated and will be removed in the next release 0.12.0.
Please use the 'backend' option instead.
- False: inductor compilation is not used. graph runs in eager
(custom_ops enabled by default).
- True: inductor compilation is used (custom_ops disabled by default).
One graph for symbolic shape and one graph per size in compile_sizes
are compiled using configurations in inductor_compile_config.
This setting is ignored if mode<VLLM_COMPILE.
For future compatibility:
If use_inductor is True, backend="inductor" otherwise backend="eager".
"""
compile_sizes: list[int | str] | None = None
"""Sizes to compile for inductor. In addition
to integers, it also supports "cudagraph_capture_sizes" to
@ -759,14 +740,6 @@ class CompilationConfig:
f"Invalid backend for piecewise compilation: {self.backend}"
)
if self.use_inductor is not None:
logger.warning_once(
"The 'use_inductor' flag is deprecated and will be "
"removed in the next release (v0.12.0). "
"Please use the 'backend' option instead.",
)
self.backend = "inductor" if self.use_inductor else "eager"
if self.backend == "":
self.backend = current_platform.get_compile_backend()

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from dataclasses import field
from typing import Any, Literal
@ -10,6 +9,7 @@ from pydantic import ConfigDict, SkipValidation
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
@ -45,7 +45,7 @@ class DeviceConfig:
# the device/platform information will be summarized
# by torch/vllm automatically.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self):

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import uuid
from dataclasses import field
from typing import Any, Literal, get_args
@ -9,6 +8,7 @@ from typing import Any, Literal, get_args
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
KVProducer = Literal["kv_producer", "kv_both"]
KVConsumer = Literal["kv_consumer", "kv_both"]
@ -79,7 +79,7 @@ class KVTransferConfig:
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self) -> None:

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import TYPE_CHECKING, Any
from pydantic import Field, field_validator
@ -9,6 +8,7 @@ from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
if TYPE_CHECKING:
from vllm.model_executor.model_loader import LoadFormats
@ -104,7 +104,7 @@ class LoadConfig:
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@field_validator("load_format", mode="after")

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import TYPE_CHECKING, Any, Literal
import torch
@ -11,6 +10,7 @@ from typing_extensions import Self
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
if TYPE_CHECKING:
from vllm.config import ModelConfig
@ -74,7 +74,7 @@ class LoRAConfig:
factors.append(self.fully_sharded_loras)
factors.append(self.lora_dtype)
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@model_validator(mode="after")

View File

@ -39,7 +39,12 @@ from vllm.transformers_utils.gguf_utils import (
maybe_patch_hf_config_from_gguf,
)
from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
from vllm.transformers_utils.utils import check_gguf_file, maybe_model_redirect
from vllm.transformers_utils.utils import (
is_gguf,
is_remote_gguf,
maybe_model_redirect,
split_remote_gguf,
)
from vllm.utils.import_utils import LazyLoader
from vllm.utils.torch_utils import common_broadcastable_dtype
@ -294,9 +299,6 @@ class ModelConfig:
pooler_config: PoolerConfig | None = None
"""Pooler config which controls the behaviour of output pooling in pooling
models."""
override_pooler_config: dict | PoolerConfig | None = None
"""[DEPRECATED] Use `pooler_config` instead. This field will be removed in
v0.12.0 or v1.0.0, whichever is sooner."""
# Multimodal config and init vars
multimodal_config: MultiModalConfig | None = None
@ -343,7 +345,6 @@ class ModelConfig:
"logprobs_mode",
"disable_cascade_attn",
"skip_tokenizer_init",
"enable_prompt_embeds",
"served_model_name",
"config_format",
"hf_token",
@ -354,7 +355,6 @@ class ModelConfig:
"logits_processors",
"io_processor_plugin",
"pooler_config",
"override_pooler_config",
"multimodal_config",
"limit_mm_per_prompt",
"media_io_kwargs",
@ -440,7 +440,8 @@ class ModelConfig:
self.model = maybe_model_redirect(self.model)
# The tokenizer is consistent with the model by default.
if self.tokenizer is None:
if check_gguf_file(self.model):
# Check if this is a GGUF model (either local file or remote GGUF)
if is_gguf(self.model):
raise ValueError(
"Using a tokenizer is mandatory when loading a GGUF model. "
"Please specify the tokenizer path or name using the "
@ -642,18 +643,6 @@ class ModelConfig:
# Init pooler config if needed
if self.runner_type == "pooling":
if self.override_pooler_config is not None:
logger.warning_once(
"`override_pooler_config` is deprecated and will be "
"removed in v0.12.0 or v1.0.0, whichever is sooner. "
"Please use `pooler_config` instead."
)
if isinstance(self.override_pooler_config, dict):
self.pooler_config = PoolerConfig(**self.override_pooler_config)
else:
self.pooler_config = self.override_pooler_config
if self.pooler_config is None:
self.pooler_config = PoolerConfig()
@ -832,7 +821,10 @@ class ModelConfig:
self.tokenizer = object_storage_tokenizer.dir
def _get_encoder_config(self):
return get_sentence_transformer_tokenizer_config(self.model, self.revision)
model = self.model
if is_remote_gguf(model):
model, _ = split_remote_gguf(model)
return get_sentence_transformer_tokenizer_config(model, self.revision)
def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, Literal, TypeAlias
@ -9,6 +8,7 @@ from pydantic import ConfigDict, Field, field_validator, model_validator
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
if TYPE_CHECKING:
from vllm.attention.backends.registry import AttentionBackendEnum
@ -216,7 +216,7 @@ class MultiModalConfig:
if self.mm_encoder_attn_backend is not None
else None
]
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
def get_limit_per_prompt(self, modality: str) -> int:

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from functools import cached_property
from typing import Any, Literal, cast
@ -11,6 +10,7 @@ from pydantic.dataclasses import dataclass
from vllm import version
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
DetailedTraceModules = Literal["model", "worker", "all"]
@ -78,7 +78,7 @@ class ObservabilityConfig:
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@field_validator("show_hidden_metrics_for_version")

View File

@ -593,9 +593,10 @@ class ParallelConfig:
"max_parallel_loading_workers is currently "
"not supported and will be ignored."
)
if self.distributed_executor_backend != "mp" and self.nnodes > 1:
if self.distributed_executor_backend not in ("mp", "uni") and self.nnodes > 1:
raise ValueError(
"nnodes > 1 can only be set when distributed exectuor backend is mp."
"nnodes > 1 can only be set when distributed executor "
"backend is mp or uni."
)
@property

View File

@ -1,13 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import Any
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
logger = init_logger(__name__)
@ -102,7 +102,7 @@ class PoolerConfig:
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from collections.abc import Callable
from dataclasses import InitVar
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
@ -12,6 +11,7 @@ from typing_extensions import Self, deprecated
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
from vllm.utils.import_utils import resolve_obj_by_qualname
if TYPE_CHECKING:
@ -178,7 +178,7 @@ class SchedulerConfig:
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@field_validator("scheduler_cls", "async_scheduling", mode="wrap")

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import hashlib
from typing import TYPE_CHECKING, Any, Literal, get_args
from pydantic import Field, SkipValidation, model_validator
@ -13,6 +12,7 @@ from vllm.config.model import ModelConfig
from vllm.config.parallel import ParallelConfig
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
from vllm.utils.import_utils import LazyLoader, has_arctic_inference
if TYPE_CHECKING:
@ -162,7 +162,7 @@ class SpeculativeConfig:
# Eagle3 affects the computation graph because it returns intermediate
# hidden states in addition to the final hidden state.
factors.append(self.method == "eagle3")
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@staticmethod

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import Any, Literal
from pydantic import model_validator
@ -9,6 +8,7 @@ from pydantic.dataclasses import dataclass
from typing_extensions import Self
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
StructuredOutputsBackend = Literal[
"auto", "xgrammar", "guidance", "outlines", "lm-format-enforcer"
@ -58,7 +58,7 @@ class StructuredOutputsConfig:
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
@model_validator(mode="after")

View File

@ -3,7 +3,6 @@
import copy
import getpass
import hashlib
import json
import os
import tempfile
@ -25,6 +24,7 @@ from vllm.config.speculative import EagleModelTypes
from vllm.logger import enable_trace_function_call, init_logger
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
from vllm.utils import random_uuid
from vllm.utils.hashing import safe_hash
from .cache import CacheConfig
from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
@ -193,7 +193,7 @@ class VllmConfig:
vllm_factors.append("None")
if self.additional_config:
if isinstance(additional_config := self.additional_config, dict):
additional_config_hash = hashlib.md5(
additional_config_hash = safe_hash(
json.dumps(additional_config, sort_keys=True).encode(),
usedforsecurity=False,
).hexdigest()
@ -204,9 +204,9 @@ class VllmConfig:
vllm_factors.append("None")
factors.append(vllm_factors)
hash_str = hashlib.md5(
str(factors).encode(), usedforsecurity=False
).hexdigest()[:10]
hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[
:10
]
return hash_str
def pad_for_cudagraph(self, batch_size: int) -> int:

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Optional
@ -15,6 +14,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorRole,
)
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
from vllm.v1.attention.backends.mla.common import MLACommonMetadata
from vllm.v1.core.sched.output import SchedulerOutput
@ -423,7 +423,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
if mm_hashes:
mm_str = "-".join(mm_hashes)
token_bytes += mm_str.encode("utf-8")
input_ids_hash = hashlib.md5(token_bytes, usedforsecurity=False).hexdigest()
input_ids_hash = safe_hash(token_bytes, usedforsecurity=False).hexdigest()
foldername = os.path.join(self._storage_path, input_ids_hash)
if create_folder:

View File

@ -51,6 +51,7 @@ from vllm.distributed.utils import StatelessProcessGroup
from vllm.logger import init_logger
from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.utils.network_utils import get_distributed_init_method
from vllm.utils.system_utils import suppress_stdout
from vllm.utils.torch_utils import (
direct_register_custom_op,
supports_custom_op,
@ -329,7 +330,8 @@ class GroupCoordinator:
)
# a group with `gloo` backend, to allow direct coordination between
# processes through the CPU.
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
with suppress_stdout():
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
if self.rank in ranks:
self.ranks = ranks
self.world_size = len(ranks)

View File

@ -30,6 +30,7 @@ from torch.distributed.rendezvous import rendezvous
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.utils.network_utils import get_tcp_uri
from vllm.utils.system_utils import suppress_stdout
from vllm.utils.torch_utils import is_torch_equal_or_newer
logger = init_logger(__name__)
@ -427,33 +428,34 @@ def init_gloo_process_group(
Stateless init ProcessGroup with gloo backend compatible with
different torch versions.
"""
if is_torch_equal_or_newer("2.6"):
pg = ProcessGroup(
prefix_store,
group_rank,
group_size,
)
else:
options = ProcessGroup.Options(backend="gloo")
pg = ProcessGroup(
prefix_store,
group_rank,
group_size,
options,
)
from torch.distributed.distributed_c10d import ProcessGroupGloo
with suppress_stdout():
if is_torch_equal_or_newer("2.6"):
pg = ProcessGroup(
prefix_store,
group_rank,
group_size,
)
else:
options = ProcessGroup.Options(backend="gloo")
pg = ProcessGroup(
prefix_store,
group_rank,
group_size,
options,
)
from torch.distributed.distributed_c10d import ProcessGroupGloo
backend_class = ProcessGroupGloo(
prefix_store, group_rank, group_size, timeout=timeout
)
backend_type = ProcessGroup.BackendType.GLOO
device = torch.device("cpu")
if is_torch_equal_or_newer("2.6"):
# _set_default_backend is supported in torch >= 2.6
pg._set_default_backend(backend_type)
backend_class._set_sequence_number_for_group()
backend_class = ProcessGroupGloo(
prefix_store, group_rank, group_size, timeout=timeout
)
backend_type = ProcessGroup.BackendType.GLOO
device = torch.device("cpu")
if is_torch_equal_or_newer("2.6"):
# _set_default_backend is supported in torch >= 2.6
pg._set_default_backend(backend_type)
backend_class._set_sequence_number_for_group()
pg._register_backend(device, backend_type, backend_class)
pg._register_backend(device, backend_type, backend_class)
return pg

View File

@ -29,7 +29,7 @@ import regex as re
import torch
from pydantic import TypeAdapter, ValidationError
from pydantic.fields import FieldInfo
from typing_extensions import TypeIs, deprecated
from typing_extensions import TypeIs
import vllm.envs as envs
from vllm.attention.backends.registry import AttentionBackendEnum
@ -86,7 +86,7 @@ from vllm.transformers_utils.config import (
is_interleaved,
maybe_override_with_speculators,
)
from vllm.transformers_utils.utils import check_gguf_file, is_cloud_storage
from vllm.transformers_utils.utils import is_cloud_storage, is_gguf
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.network_utils import get_ip
@ -520,9 +520,6 @@ class EngineArgs:
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
pooler_config: PoolerConfig | None = ModelConfig.pooler_config
override_pooler_config: dict | PoolerConfig | None = (
ModelConfig.override_pooler_config
)
compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config")
worker_cls: str = ParallelConfig.worker_cls
worker_extension_cls: str = ParallelConfig.worker_extension_cls
@ -659,11 +656,6 @@ class EngineArgs:
)
model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"])
model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"])
model_group.add_argument(
"--override-pooler-config",
**model_kwargs["override_pooler_config"],
deprecated=True,
)
model_group.add_argument(
"--logits-processor-pattern", **model_kwargs["logits_processor_pattern"]
)
@ -880,7 +872,11 @@ class EngineArgs:
"--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
)
cache_group.add_argument(
"--enable-prefix-caching", **cache_kwargs["enable_prefix_caching"]
"--enable-prefix-caching",
**{
**cache_kwargs["enable_prefix_caching"],
"default": None,
},
)
cache_group.add_argument(
"--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]
@ -1144,8 +1140,8 @@ class EngineArgs:
return engine_args
def create_model_config(self) -> ModelConfig:
# gguf file needs a specific model loader and doesn't use hf_repo
if check_gguf_file(self.model):
# gguf file needs a specific model loader
if is_gguf(self.model):
self.quantization = self.load_format = "gguf"
# NOTE(woosuk): In V1, we use separate processes for workers (unless
@ -1239,7 +1235,6 @@ class EngineArgs:
mm_encoder_tp_mode=self.mm_encoder_tp_mode,
mm_encoder_attn_backend=self.mm_encoder_attn_backend,
pooler_config=self.pooler_config,
override_pooler_config=self.override_pooler_config,
logits_processor_pattern=self.logits_processor_pattern,
generation_config=self.generation_config,
override_generation_config=self.override_generation_config,
@ -1812,9 +1807,11 @@ class EngineArgs:
if model_config.runner_type != "pooling":
default_chunked_prefill = True
# Disable prefix caching default for hybrid models
# since the feature is still experimental.
default_prefix_caching = not model_config.is_hybrid
# Disable prefix caching default for hybrid models and mamba-only
# models since the feature is still experimental.
default_prefix_caching = not (
model_config.is_hybrid or model_config.is_attention_free
)
else:
assert model_config.pooler_config is not None
@ -2041,24 +2038,6 @@ class AsyncEngineArgs(EngineArgs):
enable_log_requests: bool = False
@property
@deprecated(
"`disable_log_requests` is deprecated and has been replaced with "
"`enable_log_requests`. This will be removed in v0.12.0. Please use "
"`enable_log_requests` instead."
)
def disable_log_requests(self) -> bool:
return not self.enable_log_requests
@disable_log_requests.setter
@deprecated(
"`disable_log_requests` is deprecated and has been replaced with "
"`enable_log_requests`. This will be removed in v0.12.0. Please use "
"`enable_log_requests` instead."
)
def disable_log_requests(self, value: bool):
self.enable_log_requests = not value
@staticmethod
def add_cli_args(
parser: FlexibleArgumentParser, async_args_only: bool = False

View File

@ -174,9 +174,6 @@ class LLM:
For example, for Phi-3-Vision: `{"num_crops": 4}`.
pooler_config: Initialize non-default pooling config for the pooling
model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
override_pooler_config: [DEPRECATED] Use `pooler_config` instead. This
argument is deprecated and will be removed in v0.12.0 or v1.0.0,
whichever is sooner.
compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the mode of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration.
@ -214,7 +211,6 @@ class LLM:
hf_overrides: HfOverrides | None = None,
mm_processor_kwargs: dict[str, Any] | None = None,
pooler_config: PoolerConfig | None = None,
override_pooler_config: PoolerConfig | None = None,
structured_outputs_config: dict[str, Any]
| StructuredOutputsConfig
| None = None,
@ -330,7 +326,6 @@ class LLM:
hf_overrides=hf_overrides,
mm_processor_kwargs=mm_processor_kwargs,
pooler_config=pooler_config,
override_pooler_config=override_pooler_config,
structured_outputs_config=structured_outputs_instance,
compilation_config=compilation_config_instance,
logits_processors=logits_processors,

View File

@ -29,7 +29,6 @@ from openai.types.responses import (
ResponseOutputItemAddedEvent,
ResponseOutputItemDoneEvent,
ResponsePrompt,
ResponseReasoningItem,
ResponseReasoningTextDeltaEvent,
ResponseReasoningTextDoneEvent,
ResponseStatus,
@ -304,9 +303,7 @@ def get_logits_processors(
return None
ResponseInputOutputItem: TypeAlias = (
ResponseInputItemParam | ResponseReasoningItem | ResponseFunctionToolCall
)
ResponseInputOutputItem: TypeAlias = ResponseInputItemParam | ResponseOutputItem
class ResponsesRequest(OpenAIBaseModel):

View File

@ -10,6 +10,9 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
Function as FunctionCallTool,
)
from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
from openai.types.responses.response_output_message import ResponseOutputMessage
from openai.types.responses.response_reasoning_item import ResponseReasoningItem
from openai.types.responses.tool import Tool
@ -94,6 +97,12 @@ def construct_chat_message_with_tool_call(
"role": "assistant",
"reasoning": reasoning_content,
}
elif isinstance(item, ResponseFunctionToolCallOutputItem):
return ChatCompletionToolMessageParam(
role="tool",
content=item.output,
tool_call_id=item.call_id,
)
elif item.get("type") == "function_call_output":
# Append the function call output as a tool message.
return ChatCompletionToolMessageParam(

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
@ -12,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.config import (
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceNoOP,
)
from vllm.model_executor.layers.fused_moe.utils import _resize_cache
from vllm.triton_utils import tl, triton
from vllm.utils.import_utils import has_triton_kernels
@ -88,14 +90,17 @@ def triton_kernel_moe_forward(
gating_output, topk, sm_first=not renormalize
)
output = torch.empty_like(hidden_states)
return triton_kernel_fused_experts(
None,
output,
hidden_states,
w1,
w2,
routing_data,
gather_idx,
scatter_idx,
topk=topk,
activation=activation,
quant_config=quant_config,
apply_router_weight_on_input=apply_router_weight_on_input,
@ -113,6 +118,7 @@ def triton_kernel_fused_experts(
routing_data, # RoutingData
gather_indx, # GatherIndx
scatter_indx, # ScatterIndx
topk: int,
activation: str = "silu",
quant_config: FusedMoEQuantConfig | None = None,
swiglu_alpha: float = 1.702,
@ -120,6 +126,7 @@ def triton_kernel_fused_experts(
apply_router_weight_on_input: bool = False,
global_num_experts: int = -1,
expert_map: torch.Tensor | None = None,
intermediate_cache: torch.Tensor | None = None,
a1q_scale: torch.Tensor | None = None,
) -> torch.Tensor:
if quant_config is None:
@ -131,14 +138,30 @@ def triton_kernel_fused_experts(
assert quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32
# Shape check, only check non-mxfp4
assert hidden_states.ndim == 2
assert hidden_states.shape[-1] == w1.shape[-2]
assert w2.shape[-1] == w1.shape[1]
batch_dim = 1
M, K = hidden_states.shape[-2:]
E, _, N = w1.shape
if global_num_experts == -1:
global_num_experts = E
if intermediate_cache is None:
intermediate_cache = torch.empty(
(batch_dim, M * topk, N // 2),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
# Add batch_dim to output buffer because matmul_ogs expects 3D output
intermediate_cache = _resize_cache(
intermediate_cache, (batch_dim, M * topk, N // 2)
)
output_tensor = _resize_cache(output_tensor, (batch_dim, M, K))
act = FusedActivation(
FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
(swiglu_alpha, swiglu_limit),
@ -146,7 +169,7 @@ def triton_kernel_fused_experts(
)
gammas = routing_data.gate_scal if routing_data else None
intermediate_cache1 = matmul_ogs(
matmul_ogs(
hidden_states,
w1,
quant_config.w1_bias,
@ -155,10 +178,11 @@ def triton_kernel_fused_experts(
precision_config=quant_config.w1_precision,
gammas=gammas if apply_router_weight_on_input else None,
fused_activation=act,
y=intermediate_cache,
)
intermediate_cache3 = matmul_ogs(
intermediate_cache1,
matmul_ogs(
intermediate_cache.view(M * topk, N // 2),
w2,
quant_config.w2_bias,
routing_data,
@ -167,7 +191,8 @@ def triton_kernel_fused_experts(
gammas=None if apply_router_weight_on_input else gammas,
y=output_tensor,
)
return intermediate_cache3
output_tensor = output_tensor.view(M, K)
return output_tensor
def make_routing_data(
@ -221,6 +246,42 @@ class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
def supports_expert_map(self) -> bool:
return True
def moe_problem_size(
self,
a1: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_ids: torch.Tensor,
) -> tuple[int, int, int, int, int]:
"""
Extract the MoE problem size from the given tensor arguments:
- a: The hidden states, input to the MoE layer.
- w1: The first set of expert weights.
- w2: The second set of expert weights.
- topk_ids: The topk ids.
Note: extracting the problem shape from the weight and activation
tensors is not obvious. It needs to be done this way specifically
due to subtle issues with particular kernels, e.g. the int4 kernels
divide the trailing dimension by two, so it's not "correct" to
extract N or K from the trailing dimension of w1 or w2. Similarly,
some kernels transpose the weights, so this needs to be kept in mind.
Note: This implementation covers most cases. However, if experts
require a specialized implementation, like MarlinExperts, they are free
to override this function.
"""
assert w1.dim() == 3 and w2.dim() == 3
E, _, N = w1.size()
K = a1.size(-1)
assert a1.dim() == 2
assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
M = a1.size(0)
assert topk_ids.dim() == 2
topk = topk_ids.size(1)
return E, M, N, K, topk
def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
# Weight application and reduction happens in the fused_experts kernel.
return TopKWeightAndReduceNoOP()
@ -263,8 +324,8 @@ class OAITritonExperts(BaseOAITritonExperts):
expert_tokens_meta: mk.ExpertTokensMetadata | None,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
# workspace are allocated inside the kernel
workspace1 = (M, K)
workspace2 = (0, 0)
workspace1 = (0, 0)
workspace2 = (M * topk, N // 2)
output = (M, K)
return (workspace1, workspace2, output)
@ -297,20 +358,21 @@ class OAITritonExperts(BaseOAITritonExperts):
topk_ids, topk_weights, local_num_experts
)
experts_output = triton_kernel_fused_experts(
None,
topk = topk_ids.size(1)
triton_kernel_fused_experts(
output,
hidden_states,
w1,
w2,
routing_data,
gather_indx,
scatter_indx,
topk=topk,
activation=activation,
quant_config=self.quant_config,
apply_router_weight_on_input=False,
global_num_experts=local_num_experts,
expert_map=None, # applied already
intermediate_cache=workspace2,
a1q_scale=a1q_scale,
)
output.copy_(experts_output, non_blocking=True)

View File

@ -103,7 +103,7 @@ __all__ = [
"CompressedTensorsW8A8Int8MoEMethod",
"CompressedTensorsWNA16MarlinMoEMethod",
"CompressedTensorsWNA16MoEMethod",
"CompressedTensorsW4A4MoeMethod",
"CompressedTensorsW4A4Nvfp4MoeMethod",
"CompressedTensorsW4A8Int8MoEMethod",
]
@ -171,7 +171,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
quant_config, layer.moe_config
)
elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
return CompressedTensorsW4A4MoeMethod(layer.moe_config)
return CompressedTensorsW4A4Nvfp4MoeMethod(layer.moe_config)
elif (
quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
@ -188,7 +188,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
)
class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
class CompressedTensorsW4A4Nvfp4MoeMethod(CompressedTensorsMoEMethod):
def __init__(self, moe: FusedMoEConfig):
from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501
detect_nvfp4_moe_support,
@ -205,8 +205,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
self.flashinfer_moe_backend = get_flashinfer_moe_backend()
logger.info_once(
f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
" for CompressedTensorsW4A4MoeMethod."
" for CompressedTensorsW4A4Nvfp4MoeMethod."
)
elif self.use_marlin:
logger.info_once("Using Marlin for CompressedTensorsW4A4Nvfp4MoeMethod.")
else:
logger.info_once("Using Cutlass for CompressedTensorsW4A4Nvfp4MoeMethod.")
def create_weights(
self,
@ -612,7 +616,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
assert expert_map is None, (
"Expert Parallelism / expert_map "
"is currently not supported for "
"CompressedTensorsW4A4MoeMethod."
"CompressedTensorsW4A4Nvfp4MoeMethod."
)
assert self.moe_quant_config is not None

View File

@ -1132,6 +1132,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
" for ModelOptNvFp4FusedMoE."
)
elif self.use_marlin:
logger.info_once("Using Marlin for ModelOptNvFp4FusedMoE.")
else:
logger.info_once("Using Cutlass for ModelOptNvFp4FusedMoE.")
def maybe_make_prepare_finalize(
self,

View File

@ -196,9 +196,10 @@ class Mxfp4Config(QuantizationConfig):
# TODO: Add support for MXFP4 Linear Method.
# MXFP4 LinearMethod is available in AMD-Quark, refer to that implementation
# if you are interested in enabling MXFP4 here.
logger.warning_once(
logger.debug_once(
"MXFP4 linear layer is not implemented - falling back to "
"UnquantizedLinearMethod."
"UnquantizedLinearMethod.",
scope="local",
)
return UnquantizedLinearMethod()
elif isinstance(layer, FusedMoE):
@ -208,9 +209,10 @@ class Mxfp4Config(QuantizationConfig):
return Mxfp4MoEMethod(layer.moe_config)
elif isinstance(layer, Attention):
# TODO: Add support for MXFP4 Attention.
logger.warning_once(
logger.debug_once(
"MXFP4 attention layer is not implemented. "
"Skipping quantization for this layer."
"Skipping quantization for this layer.",
scope="local",
)
return None

View File

@ -18,6 +18,7 @@ from vllm.model_executor.model_loader.utils import (
process_weights_after_loading,
)
from vllm.model_executor.model_loader.weight_utils import (
download_gguf,
get_gguf_extra_tensor_names,
get_gguf_weight_type_map,
gguf_quant_weights_iterator,
@ -43,7 +44,8 @@ class GGUFModelLoader(BaseModelLoader):
f"load format {load_config.load_format}"
)
def _prepare_weights(self, model_name_or_path: str):
def _prepare_weights(self, model_config: ModelConfig):
model_name_or_path = model_config.model
if os.path.isfile(model_name_or_path):
return model_name_or_path
# for raw HTTPS link
@ -55,12 +57,23 @@ class GGUFModelLoader(BaseModelLoader):
if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"):
repo_id, filename = model_name_or_path.rsplit("/", 1)
return hf_hub_download(repo_id=repo_id, filename=filename)
else:
raise ValueError(
f"Unrecognised GGUF reference: {model_name_or_path} "
"(expected local file, raw URL, or <repo_id>/<filename>.gguf)"
# repo_id:quant_type
elif "/" in model_name_or_path and ":" in model_name_or_path:
repo_id, quant_type = model_name_or_path.rsplit(":", 1)
return download_gguf(
repo_id,
quant_type,
cache_dir=self.load_config.download_dir,
revision=model_config.revision,
ignore_patterns=self.load_config.ignore_patterns,
)
raise ValueError(
f"Unrecognised GGUF reference: {model_name_or_path} "
"(expected local file, raw URL, <repo_id>/<filename>.gguf, "
"or <repo_id>:<quant_type>)"
)
def _get_gguf_weights_map(self, model_config: ModelConfig):
"""
GGUF uses this naming convention for their tensors from HF checkpoint:
@ -244,7 +257,7 @@ class GGUFModelLoader(BaseModelLoader):
gguf_to_hf_name_map: dict[str, str],
) -> dict[str, str]:
weight_type_map = get_gguf_weight_type_map(
model_config.model, gguf_to_hf_name_map
model_name_or_path, gguf_to_hf_name_map
)
is_multimodal = hasattr(model_config.hf_config, "vision_config")
if is_multimodal:
@ -290,10 +303,10 @@ class GGUFModelLoader(BaseModelLoader):
yield from gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map)
def download_model(self, model_config: ModelConfig) -> None:
self._prepare_weights(model_config.model)
self._prepare_weights(model_config)
def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
local_model_path = self._prepare_weights(model_config.model)
local_model_path = self._prepare_weights(model_config)
gguf_weights_map = self._get_gguf_weights_map(model_config)
model.load_weights(
self._get_weights_iterator(model_config, local_model_path, gguf_weights_map)
@ -303,7 +316,7 @@ class GGUFModelLoader(BaseModelLoader):
self, vllm_config: VllmConfig, model_config: ModelConfig
) -> nn.Module:
device_config = vllm_config.device_config
local_model_path = self._prepare_weights(model_config.model)
local_model_path = self._prepare_weights(model_config)
gguf_weights_map = self._get_gguf_weights_map(model_config)
# we can only know if tie word embeddings after mapping weights
if "lm_head.weight" in get_gguf_extra_tensor_names(

View File

@ -369,6 +369,52 @@ def get_sparse_attention_config(
return config
def download_gguf(
repo_id: str,
quant_type: str,
cache_dir: str | None = None,
revision: str | None = None,
ignore_patterns: str | list[str] | None = None,
) -> str:
# Use patterns that snapshot_download can handle directly
# Patterns to match:
# - *-{quant_type}.gguf (root)
# - *-{quant_type}-*.gguf (root sharded)
# - */*-{quant_type}.gguf (subdir)
# - */*-{quant_type}-*.gguf (subdir sharded)
allow_patterns = [
f"*-{quant_type}.gguf",
f"*-{quant_type}-*.gguf",
f"*/*-{quant_type}.gguf",
f"*/*-{quant_type}-*.gguf",
]
# Use download_weights_from_hf which handles caching and downloading
folder = download_weights_from_hf(
model_name_or_path=repo_id,
cache_dir=cache_dir,
allow_patterns=allow_patterns,
revision=revision,
ignore_patterns=ignore_patterns,
)
# Find the downloaded file(s) in the folder
local_files = []
for pattern in allow_patterns:
# Convert pattern to glob pattern for local filesystem
glob_pattern = os.path.join(folder, pattern)
local_files.extend(glob.glob(glob_pattern))
if not local_files:
raise ValueError(
f"Downloaded GGUF files not found in {folder} for quant_type {quant_type}"
)
# Sort to ensure consistent ordering (prefer non-sharded files)
local_files.sort(key=lambda x: (x.count("-"), x))
return local_files[0]
def download_weights_from_hf(
model_name_or_path: str,
cache_dir: str | None,

View File

@ -233,7 +233,7 @@ class BaiChuanDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
position_embedding=position_embedding,
rope_parameters=config.rope_parameters,
rope_parameters=getattr(config, "rope_parameters", None),
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@ -289,9 +289,6 @@ class MambaModelConfig(VerifyAndUpdateConfig):
model_config = vllm_config.model_config
cache_config = vllm_config.cache_config
if cache_config.mamba_block_size is None:
cache_config.mamba_block_size = model_config.max_model_len
if cache_config.enable_prefix_caching:
if model_config.supports_mamba_prefix_caching:
logger.info(
@ -299,6 +296,11 @@ class MambaModelConfig(VerifyAndUpdateConfig):
"Its support for Mamba layers is experimental. "
"Please report any issues you may observe."
)
# By default, mamba block size will be set to max_model_len (see
# below). When enabling prefix caching, we align mamba block size
# to the block size as the basic granularity for prefix caching.
if cache_config.mamba_block_size is None:
cache_config.mamba_block_size = cache_config.block_size
else:
logger.info(
"Hybrid or mamba-based model detected without "
@ -306,6 +308,9 @@ class MambaModelConfig(VerifyAndUpdateConfig):
)
cache_config.enable_prefix_caching = False
if cache_config.mamba_block_size is None:
cache_config.mamba_block_size = model_config.max_model_len
# TODO(tdoublep): remove once cascade attention is supported
logger.info(
"Disabling cascade attention since it is not supported for hybrid models."

View File

@ -100,7 +100,7 @@ class GPTJAttention(nn.Module):
self.head_size,
rotary_dim=config.rotary_dim,
max_position=max_position_embeddings,
rope_parameters=config.rope_parameters,
rope_parameters=getattr(config, "rope_parameters", None),
is_neox_style=False,
)
self.attn = Attention(

View File

@ -239,7 +239,7 @@ class Grok1DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_parameters=config.rope_parameters,
rope_parameters=getattr(config, "rope_parameters", None),
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",

View File

@ -262,7 +262,7 @@ class LlamaAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
rope_parameters=config.rope_parameters,
rope_parameters=getattr(config, "rope_parameters", None),
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)

View File

@ -5,7 +5,6 @@ Whenever you add an architecture to this page, please also update
`tests/models/registry.py` with example HuggingFace models for it.
"""
import hashlib
import importlib
import json
import os
@ -32,6 +31,7 @@ from vllm.config import (
from vllm.logger import init_logger
from vllm.logging_utils import logtime
from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module
from vllm.utils.hashing import safe_hash
from .interfaces import (
has_inner_state,
@ -655,7 +655,7 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
if model_path.exists():
with open(model_path, "rb") as f:
module_hash = hashlib.md5(f.read(), usedforsecurity=False).hexdigest()
module_hash = safe_hash(f.read(), usedforsecurity=False).hexdigest()
mi = self._load_modelinfo_from_cache(module_hash)
if mi is not None:

View File

@ -407,9 +407,6 @@ class CudaPlatformBase(Platform):
# We have found some valid backends. Select the one with the
# highest priority.
logger.info(
"Valid backends: %s", [b[0].name for b in valid_backends_priorities]
)
sorted_indices = sorted(
range(len(valid_backends_priorities)),
key=lambda i: valid_backends_priorities[i][1],
@ -417,8 +414,9 @@ class CudaPlatformBase(Platform):
selected_index = sorted_indices[0]
selected_backend = valid_backends_priorities[selected_index][0]
logger.info(
"Using %s backend.",
"Using %s attention backend out of potential backends: %s",
selected_backend.name,
[b[0].name for b in valid_backends_priorities],
)
return selected_backend.get_path()

View File

@ -42,7 +42,10 @@ from vllm.logger import init_logger
from vllm.transformers_utils.config_parser_base import ConfigParserBase
from vllm.transformers_utils.utils import (
check_gguf_file,
is_gguf,
is_remote_gguf,
parse_safetensors_file_metadata,
split_remote_gguf,
)
if envs.VLLM_USE_MODELSCOPE:
@ -453,51 +456,55 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No
def patch_rope_parameters(config: PretrainedConfig) -> None:
"""Provide backwards compatibility for RoPE."""
# Retrieve rope_parameters differently based on Transformers version
# Patch rope_parameters differently based on Transformers version
if Version(version("transformers")) >= Version("5.0.0.dev0"):
from transformers.modeling_rope_utils import RopeParameters
rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr(
config, "rope_parameters", None
from transformers.modeling_rope_utils import (
rope_config_validation,
standardize_rope_params,
)
elif hasattr(config, "rope_parameters"):
# We are in Transformers v4 and rope_parameters
# has already been patched for this config
return
# When Transformers v5 is installed, legacy rope_theta may be present
# when using custom code models written for Transformers v4
if (rope_theta := getattr(config, "rope_theta", None)) is not None:
standardize_rope_params(config, rope_theta=rope_theta)
rope_config_validation(config)
# Delete rope_theta to avoid confusion in downstream code
del config.rope_theta
else:
# Convert Transformers v4 rope_theta and rope_scaling into rope_parameters
rope_theta: float | None = getattr(config, "rope_theta", None)
rope_scaling: dict | None = getattr(config, "rope_scaling", None)
rope_parameters = rope_scaling
# Move rope_theta into rope_parameters
if rope_theta is not None:
rope_parameters = rope_parameters or {"rope_type": "default"}
rope_parameters["rope_theta"] = rope_theta
# Add original_max_position_embeddings if present
if rope_parameters and (
ompe := getattr(config, "original_max_position_embeddings", None)
):
rope_parameters["original_max_position_embeddings"] = ompe
# Write back to config
config.rope_parameters = rope_parameters
# When Transformers v4 is installed, legacy rope_scaling may be present
if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
config.rope_parameters = rope_scaling
# When Transformers v4 is installed, legacy rope_theta may be present
if (rope_theta := getattr(config, "rope_theta", None)) is not None:
if not hasattr(config, "rope_parameters"):
config.rope_parameters = {"rope_type": "default"}
config.rope_parameters["rope_theta"] = rope_theta
# No RoPE parameters to patch
if rope_parameters is None:
if not hasattr(config, "rope_parameters"):
return
# Add original_max_position_embeddings if present
if ompe := getattr(config, "original_max_position_embeddings", None):
config.rope_parameters["original_max_position_embeddings"] = ompe
# Handle nested rope_parameters in interleaved sliding attention models
if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
for rope_parameters_layer_type in rope_parameters.values():
if set(config.rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
for rope_parameters_layer_type in config.rope_parameters.values():
patch_rope_parameters_dict(rope_parameters_layer_type)
else:
patch_rope_parameters_dict(rope_parameters)
patch_rope_parameters_dict(config.rope_parameters)
def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
if "rope_type" in rope_parameters and "type" in rope_parameters:
rope_type = rope_parameters["rope_type"]
rope_type_legacy = rope_parameters["type"]
if rope_type != rope_type_legacy:
if (rope_type_legacy == "su" and rope_type == "longrope") or (
rope_type_legacy == "mrope" and rope_type == "default"
):
pass # No action needed
elif rope_type != rope_type_legacy:
raise ValueError(
f"Found conflicts between 'rope_type={rope_type}' (modern "
f"field) and 'type={rope_type_legacy}' (legacy field). "
@ -629,10 +636,12 @@ def maybe_override_with_speculators(
Returns:
Tuple of (resolved_model, resolved_tokenizer, speculative_config)
"""
is_gguf = check_gguf_file(model)
if is_gguf:
if check_gguf_file(model):
kwargs["gguf_file"] = Path(model).name
gguf_model_repo = Path(model).parent
elif is_remote_gguf(model):
repo_id, _ = split_remote_gguf(model)
gguf_model_repo = Path(repo_id)
else:
gguf_model_repo = None
kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
@ -678,10 +687,18 @@ def get_config(
) -> PretrainedConfig:
# Separate model folder from file path for GGUF models
is_gguf = check_gguf_file(model)
if is_gguf:
kwargs["gguf_file"] = Path(model).name
model = Path(model).parent
_is_gguf = is_gguf(model)
_is_remote_gguf = is_remote_gguf(model)
if _is_gguf:
if check_gguf_file(model):
# Local GGUF file
kwargs["gguf_file"] = Path(model).name
model = Path(model).parent
elif _is_remote_gguf:
# Remote GGUF - extract repo_id from repo_id:quant_type format
# The actual GGUF file will be downloaded later by GGUFModelLoader
# Keep model as repo_id:quant_type for download, but use repo_id for config
model, _ = split_remote_gguf(model)
if config_format == "auto":
try:
@ -689,10 +706,25 @@ def get_config(
# Transformers implementation.
if file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision):
config_format = "mistral"
elif is_gguf or file_or_path_exists(
elif (_is_gguf and not _is_remote_gguf) or file_or_path_exists(
model, HF_CONFIG_NAME, revision=revision
):
config_format = "hf"
# Remote GGUF models must have config.json in repo,
# otherwise the config can't be parsed correctly.
# FIXME(Isotr0py): Support remote GGUF repos without config.json
elif _is_remote_gguf and not file_or_path_exists(
model, HF_CONFIG_NAME, revision=revision
):
err_msg = (
"Could not find config.json for remote GGUF model repo. "
"To load remote GGUF model through `<repo_id>:<quant_type>`, "
"ensure your model has config.json (HF format) file. "
"Otherwise please specify --hf-config-path <original_repo> "
"in engine args to fetch config from unquantized hf model."
)
logger.error(err_msg)
raise ValueError(err_msg)
else:
raise ValueError(
"Could not detect config format for no config file found. "
@ -713,9 +745,6 @@ def get_config(
"'config.json'.\n"
" - For Mistral models: ensure the presence of a "
"'params.json'.\n"
"3. For GGUF: pass the local path of the GGUF checkpoint.\n"
" Loading GGUF from a remote repo directly is not yet "
"supported.\n"
).format(model=model)
raise ValueError(error_message) from e
@ -729,7 +758,7 @@ def get_config(
**kwargs,
)
# Special architecture mapping check for GGUF models
if is_gguf:
if _is_gguf:
if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
raise RuntimeError(f"Can't get gguf config for {config.model_type}.")
model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
@ -889,6 +918,8 @@ def get_pooling_config(model: str, revision: str | None = "main") -> dict | None
A dictionary containing the pooling type and whether
normalization is used, or None if no pooling configuration is found.
"""
if is_remote_gguf(model):
model, _ = split_remote_gguf(model)
modules_file_name = "modules.json"
@ -1108,6 +1139,8 @@ def get_hf_image_processor_config(
# Separate model folder from file path for GGUF models
if check_gguf_file(model):
model = Path(model).parent
elif is_remote_gguf(model):
model, _ = split_remote_gguf(model)
return get_image_processor_config(
model, token=hf_token, revision=revision, **kwargs
)

View File

@ -18,7 +18,7 @@ from transformers.processing_utils import ProcessorMixin
from transformers.video_processing_utils import BaseVideoProcessor
from typing_extensions import TypeVar
from vllm.transformers_utils.utils import check_gguf_file, convert_model_repo_to_path
from vllm.transformers_utils.utils import convert_model_repo_to_path, is_gguf
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
if TYPE_CHECKING:
@ -236,8 +236,8 @@ def cached_processor_from_config(
processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
**kwargs: Any,
) -> _P:
if check_gguf_file(model_config.model):
assert not check_gguf_file(model_config.tokenizer), (
if is_gguf(model_config.model):
assert not is_gguf(model_config.tokenizer), (
"For multimodal GGUF models, the original tokenizer "
"should be used to correctly load processor."
)
@ -350,8 +350,8 @@ def cached_image_processor_from_config(
model_config: "ModelConfig",
**kwargs: Any,
):
if check_gguf_file(model_config.model):
assert not check_gguf_file(model_config.tokenizer), (
if is_gguf(model_config.model):
assert not is_gguf(model_config.tokenizer), (
"For multimodal GGUF models, the original tokenizer "
"should be used to correctly load image processor."
)

View File

@ -20,7 +20,12 @@ from vllm.transformers_utils.config import (
list_filtered_repo_files,
)
from vllm.transformers_utils.tokenizers import MistralTokenizer
from vllm.transformers_utils.utils import check_gguf_file
from vllm.transformers_utils.utils import (
check_gguf_file,
is_gguf,
is_remote_gguf,
split_remote_gguf,
)
if TYPE_CHECKING:
from vllm.config import ModelConfig
@ -180,10 +185,12 @@ def get_tokenizer(
kwargs["truncation_side"] = "left"
# Separate model folder from file path for GGUF models
is_gguf = check_gguf_file(tokenizer_name)
if is_gguf:
kwargs["gguf_file"] = Path(tokenizer_name).name
tokenizer_name = Path(tokenizer_name).parent
if is_gguf(tokenizer_name):
if check_gguf_file(tokenizer_name):
kwargs["gguf_file"] = Path(tokenizer_name).name
tokenizer_name = Path(tokenizer_name).parent
elif is_remote_gguf(tokenizer_name):
tokenizer_name, _ = split_remote_gguf(tokenizer_name)
# if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
# first to use official Mistral tokenizer if possible.

View File

@ -9,6 +9,8 @@ from os import PathLike
from pathlib import Path
from typing import Any
from gguf import GGMLQuantizationType
import vllm.envs as envs
from vllm.logger import init_logger
@ -46,6 +48,57 @@ def check_gguf_file(model: str | PathLike) -> bool:
return False
@cache
def is_remote_gguf(model: str | Path) -> bool:
"""Check if the model is a remote GGUF model."""
model = str(model)
return (
(not is_cloud_storage(model))
and (not model.startswith(("http://", "https://")))
and ("/" in model and ":" in model)
and is_valid_gguf_quant_type(model.rsplit(":", 1)[1])
)
def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
"""Check if the quant type is a valid GGUF quant type."""
return getattr(GGMLQuantizationType, gguf_quant_type, None) is not None
def split_remote_gguf(model: str | Path) -> tuple[str, str]:
"""Split the model into repo_id and quant type."""
model = str(model)
if is_remote_gguf(model):
parts = model.rsplit(":", 1)
return (parts[0], parts[1])
raise ValueError(
"Wrong GGUF model or invalid GGUF quant type: %s.\n"
"- It should be in repo_id:quant_type format.\n"
"- Valid GGMLQuantizationType values: %s",
model,
GGMLQuantizationType._member_names_,
)
def is_gguf(model: str | Path) -> bool:
"""Check if the model is a GGUF model.
Args:
model: Model name, path, or Path object to check.
Returns:
True if the model is a GGUF model, False otherwise.
"""
model = str(model)
# Check if it's a local GGUF file
if check_gguf_file(model):
return True
# Check if it's a remote GGUF model (repo_id:quant_type format)
return is_remote_gguf(model)
def modelscope_list_repo_files(
repo_id: str,
revision: str | None = None,

View File

@ -73,14 +73,6 @@ class FlexibleArgumentParser(ArgumentParser):
# Enable the deprecated kwarg for Python 3.12 and below
def parse_known_args(self, args=None, namespace=None):
if args is not None and "--disable-log-requests" in args:
# Special case warning because the warning below won't trigger
# if -disable-log-requests because its value is default.
logger.warning_once(
"argument '--disable-log-requests' is deprecated and "
"replaced with '--enable-log-requests'. This will be "
"removed in v0.12.0."
)
namespace, args = super().parse_known_args(args, namespace)
for action in FlexibleArgumentParser._deprecated:
if (

View File

@ -5,6 +5,7 @@ from __future__ import annotations
import hashlib
import pickle
from _hashlib import HASH, UnsupportedDigestmodError
from collections.abc import Callable
from typing import Any
@ -61,3 +62,20 @@ def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
return sha256_cbor
raise ValueError(f"Unsupported hash function: {hash_fn_name}")
def safe_hash(data: bytes, usedforsecurity: bool = True) -> HASH:
"""Hash for configs, defaulting to md5 but falling back to sha256
in FIPS constrained environments.
Args:
data: bytes
usedforsecurity: Whether the hash is used for security purposes
Returns:
Hash object
"""
try:
return hashlib.md5(data, usedforsecurity=usedforsecurity)
except (UnsupportedDigestmodError, ValueError):
return hashlib.sha256(data)

View File

@ -56,6 +56,39 @@ def set_env_var(key: str, value: str) -> Iterator[None]:
os.environ[key] = old
@contextlib.contextmanager
def suppress_stdout():
"""
Suppress stdout from C libraries at the file descriptor level.
Only suppresses stdout, not stderr, to preserve error messages.
Suppression is disabled when VLLM_LOGGING_LEVEL is set to DEBUG.
Example:
with suppress_stdout():
# C library calls that would normally print to stdout
torch.distributed.new_group(ranks, backend="gloo")
"""
# Don't suppress if logging level is DEBUG
if envs.VLLM_LOGGING_LEVEL == "DEBUG":
yield
return
stdout_fd = sys.stdout.fileno()
stdout_dup = os.dup(stdout_fd)
devnull_fd = os.open(os.devnull, os.O_WRONLY)
try:
sys.stdout.flush()
os.dup2(devnull_fd, stdout_fd)
yield
finally:
sys.stdout.flush()
os.dup2(stdout_dup, stdout_fd)
os.close(stdout_dup)
os.close(devnull_fd)
# File path utilities

View File

@ -13,6 +13,8 @@ from vllm.distributed.kv_events import (
from vllm.logger import init_logger
from vllm.v1.core.kv_cache_utils import (
BlockHash,
BlockHashList,
BlockHashListWithBlockSize,
BlockHashWithGroupId,
ExternalBlockHash,
FreeKVCacheBlockQueue,
@ -133,6 +135,10 @@ class BlockPool:
Args:
num_gpu_blocks: The number of blocks in the pool.
enable_caching: Whether to enable prefix caching.
hash_block_size: The block size of which the block hashes are computed.
The actual block size usually equals hash_block_size, but in cases
where different KV cache groups have different block sizes, the
actual block size can be a multiple of hash_block_size.
enable_kv_cache_events: Whether to enable kv cache events.
"""
@ -140,11 +146,13 @@ class BlockPool:
self,
num_gpu_blocks: int,
enable_caching: bool,
hash_block_size: int,
enable_kv_cache_events: bool = False,
):
assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
self.num_gpu_blocks = num_gpu_blocks
self.enable_caching = enable_caching
self.hash_block_size = hash_block_size
# All kv-cache blocks.
self.blocks: list[KVCacheBlock] = [
KVCacheBlock(idx) for idx in range(num_gpu_blocks)
@ -223,8 +231,20 @@ class BlockPool:
return
new_full_blocks = blocks[num_cached_blocks:num_full_blocks]
assert len(request.block_hashes) >= num_full_blocks
new_block_hashes = request.block_hashes[num_cached_blocks:]
if block_size == self.hash_block_size:
# Common case.
block_hashes: BlockHashList = request.block_hashes
else:
# block_size is a multiple of hash_block_size. This happens when
# different KV cache groups have different block sizes.
assert block_size % self.hash_block_size == 0
# Recalculate block_hashes at the granularity of block_size, using
# the original block_hashes (at the granularity of hash_block_size).
block_hashes = BlockHashListWithBlockSize(
request.block_hashes, self.hash_block_size, block_size
)
new_block_hashes = block_hashes[num_cached_blocks:]
new_hashes: list[ExternalBlockHash] | None = (
[] if self.enable_kv_cache_events else None
)

View File

@ -2,15 +2,25 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from collections.abc import Sequence
from math import lcm
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
from vllm.v1.core.kv_cache_utils import (
BlockHash,
BlockHashList,
BlockHashListWithBlockSize,
KVCacheBlock,
)
from vllm.v1.core.single_type_kv_cache_manager import (
CrossAttentionManager,
FullAttentionManager,
get_manager_for_kv_cache_spec,
)
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig, KVCacheSpec
from vllm.v1.kv_cache_interface import (
FullAttentionSpec,
KVCacheConfig,
KVCacheSpec,
)
from vllm.v1.request import Request
@ -28,13 +38,17 @@ class KVCacheCoordinator(ABC):
enable_kv_cache_events: bool,
dcp_world_size: int,
pcp_world_size: int,
hash_block_size: int,
):
self.kv_cache_config = kv_cache_config
self.max_model_len = max_model_len
self.enable_caching = enable_caching
self.block_pool = BlockPool(
kv_cache_config.num_blocks, enable_caching, enable_kv_cache_events
kv_cache_config.num_blocks,
enable_caching,
hash_block_size,
enable_kv_cache_events,
)
# Needs special handling for find_longest_cache_hit if eagle is enabled
@ -213,6 +227,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
enable_kv_cache_events: bool,
dcp_world_size: int,
pcp_world_size: int,
hash_block_size: int,
):
super().__init__(
kv_cache_config,
@ -222,6 +237,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size,
hash_block_size=hash_block_size,
)
self.num_single_type_manager = len(self.single_type_managers)
@ -255,6 +271,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
enable_kv_cache_events: bool,
dcp_world_size: int,
pcp_world_size: int,
hash_block_size: int,
):
super().__init__(
kv_cache_config,
@ -264,6 +281,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size,
hash_block_size=hash_block_size,
)
self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[0].kv_cache_spec
self.block_size = self.kv_cache_spec.block_size
@ -273,6 +291,11 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
self.block_size *= dcp_world_size
if pcp_world_size > 1:
self.block_size *= pcp_world_size
# For models using only Mamba, block_size is set to max_model_len when
# prefix caching is disabled, and hash_block_size validation is skipped.
assert not enable_caching or (hash_block_size == self.block_size), (
"UnitaryKVCacheCoordinator assumes hash_block_size == block_size"
)
assert len(self.kv_cache_config.kv_cache_groups) == 1, (
"UnitaryKVCacheCoordinator assumes only one kv cache group"
)
@ -289,6 +312,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
block_pool=self.block_pool,
kv_cache_spec=self.kv_cache_spec,
use_eagle=self.use_eagle,
alignment_tokens=self.block_size,
dcp_world_size=self.dcp_world_size,
pcp_world_size=self.pcp_world_size,
)
@ -313,6 +337,7 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
enable_kv_cache_events: bool,
dcp_world_size: int,
pcp_world_size: int,
hash_block_size: int,
):
super().__init__(
kv_cache_config,
@ -322,7 +347,17 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size,
hash_block_size=hash_block_size,
)
# hash_block_size: the block size used to compute block hashes.
# The actual block size usually equals hash_block_size, but in cases where
# different KV cache groups have different block sizes, the actual block size
# can be a multiple of hash_block_size.
self.hash_block_size = hash_block_size
assert all(
g.kv_cache_spec.block_size % hash_block_size == 0
for g in kv_cache_config.kv_cache_groups
), "block_size must be divisible by hash_block_size"
assert dcp_world_size == 1, "DCP not support hybrid attn now."
assert pcp_world_size == 1, "PCP not support hybrid attn now."
self.verify_and_split_kv_cache_groups()
@ -373,14 +408,12 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
self.other_spec = other_spec
self.full_attention_block_size = self.full_attention_spec.block_size
self.other_block_size = self.other_spec.block_size
if self.enable_caching:
# this requirement is only needed for the prefix caching logic
divisible = self.other_block_size % self.full_attention_block_size
assert divisible == 0, (
"KVCacheCoordinator assumes the block_size of full "
"attention layers is divisible by other layers now."
)
# The LCM of the block sizes of full attention and other attention.
# The cache hit length must be a multiple of the LCM of the block sizes
# to make sure the cache hit length is a multiple of the block size of
# each attention type. Requiring this because we don't support partial
# block cache hit yet.
self.lcm_block_size = lcm(self.full_attention_block_size, self.other_block_size)
if max(self.full_attention_group_ids) < min(self.other_group_ids):
self.full_attn_first = True
@ -414,25 +447,48 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
- The number of tokens of the longest cache hit.
"""
# First, find the longest cache hit for full attention.
if self.full_attention_spec.block_size == self.hash_block_size:
# Common case.
full_attention_block_hashes: BlockHashList = block_hashes
else:
# block_size is a multiple of hash_block_size. This happens when different
# KV cache groups have different block sizes. In this case, we need to
# recalculate block_hashes at the granularity of block_size, using the
# original block_hashes (at the granularity of hash_block_size).
full_attention_block_hashes = BlockHashListWithBlockSize(
block_hashes, self.hash_block_size, self.full_attention_spec.block_size
)
hit_blocks_full_attn = self.full_attention_manager_cls.find_longest_cache_hit(
block_hashes=block_hashes,
block_hashes=full_attention_block_hashes,
max_length=max_cache_hit_length,
kv_cache_group_ids=self.full_attention_group_ids,
block_pool=self.block_pool,
kv_cache_spec=self.full_attention_spec,
use_eagle=self.use_eagle,
alignment_tokens=self.lcm_block_size,
)
hit_length = len(hit_blocks_full_attn[0]) * self.full_attention_block_size
# Next, find the cache hit for the other attention WITHIN
# the cache hit of full attention.
if self.other_spec.block_size == self.hash_block_size:
# Common case.
other_block_hashes: BlockHashList = block_hashes
else:
# Similar to the full attention case, here we need to recalculate
# block_hashes at the granularity of block_size, using the original
# block_hashes (at the granularity of hash_block_size).
other_block_hashes = BlockHashListWithBlockSize(
block_hashes, self.hash_block_size, self.other_spec.block_size
)
hit_blocks_other_attn = self.other_attention_cls.find_longest_cache_hit(
block_hashes=block_hashes,
block_hashes=other_block_hashes,
max_length=hit_length,
kv_cache_group_ids=self.other_group_ids,
block_pool=self.block_pool,
kv_cache_spec=self.other_spec,
use_eagle=self.use_eagle,
alignment_tokens=self.lcm_block_size,
)
hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size
@ -466,6 +522,7 @@ def get_kv_cache_coordinator(
enable_kv_cache_events: bool,
dcp_world_size: int,
pcp_world_size: int,
hash_block_size: int,
) -> KVCacheCoordinator:
if not enable_caching:
return KVCacheCoordinatorNoPrefixCache(
@ -473,8 +530,9 @@ def get_kv_cache_coordinator(
max_model_len,
use_eagle,
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size,
dcp_world_size,
pcp_world_size,
hash_block_size,
)
if len(kv_cache_config.kv_cache_groups) == 1:
return UnitaryKVCacheCoordinator(
@ -483,8 +541,9 @@ def get_kv_cache_coordinator(
use_eagle,
enable_caching,
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size,
dcp_world_size,
pcp_world_size,
hash_block_size,
)
return HybridKVCacheCoordinator(
kv_cache_config,
@ -492,6 +551,7 @@ def get_kv_cache_coordinator(
use_eagle,
enable_caching,
enable_kv_cache_events,
dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size,
dcp_world_size,
pcp_world_size,
hash_block_size,
)

View File

@ -95,6 +95,7 @@ class KVCacheManager:
self,
kv_cache_config: KVCacheConfig,
max_model_len: int,
hash_block_size: int,
enable_caching: bool = True,
use_eagle: bool = False,
log_stats: bool = False,
@ -107,28 +108,11 @@ class KVCacheManager:
self.enable_caching = enable_caching
self.use_eagle = use_eagle
self.log_stats = log_stats
# FIXME: make prefix cache stats conditional on log_stats
# FIXME: make prefix cache stats conditional on log_stats. We still need
# this comment because when the log stats is enabled there are still
# potential configs we could expose in the future.
self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
self.block_size: int | None = None
if self.enable_caching:
assert (
len(
set(
g.kv_cache_spec.block_size
for g in kv_cache_config.kv_cache_groups
)
)
== 1
), "Only one block size is supported for now"
self.block_size = kv_cache_config.kv_cache_groups[
0
].kv_cache_spec.block_size
if dcp_world_size * pcp_world_size > 1:
assert len(kv_cache_config.kv_cache_groups) == 1
self.block_size *= dcp_world_size * pcp_world_size
self.coordinator = get_kv_cache_coordinator(
kv_cache_config=kv_cache_config,
max_model_len=self.max_model_len,
@ -137,6 +121,7 @@ class KVCacheManager:
enable_kv_cache_events=enable_kv_cache_events,
dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size,
hash_block_size=hash_block_size,
)
self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups)
self.block_pool = self.coordinator.block_pool

View File

@ -5,9 +5,9 @@
import copy
import os
from collections import defaultdict
from collections.abc import Callable, Iterable, Sequence
from dataclasses import dataclass
from typing import Any, NewType, TypeAlias
from collections.abc import Callable, Iterable, Iterator, Sequence
from dataclasses import dataclass, replace
from typing import Any, NewType, TypeAlias, overload
from vllm import envs
from vllm.config import VllmConfig
@ -825,11 +825,11 @@ def get_num_blocks(
return num_blocks
def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int:
def get_uniform_page_size(kv_cache_specs: Iterable[KVCacheSpec]) -> int:
"""
Get the page size of the KV cache.
"""
page_sizes = set(layer.page_size_bytes for layer in kv_cache_spec.values())
page_sizes = {layer.page_size_bytes for layer in kv_cache_specs}
assert len(page_sizes) == 1
return page_sizes.pop()
@ -882,6 +882,46 @@ def is_kv_cache_page_size_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool
return len(page_sizes) == 1
def unify_kv_cache_spec_page_size(
kv_cache_spec: dict[str, KVCacheSpec],
) -> dict[str, KVCacheSpec]:
"""
Unify the page size of the given KVCacheSpec. If the page size of all layers
are the same, return the original KVCacheSpec. If not same, unify the page
size by increasing the block size of layers with smaller page size. Raise
NotImplementedError if failed to unify the page size.
Args:
kv_cache_spec: The KVCacheSpec of each attention layer in the model
Returns:
The updated KVCacheSpec with the same page_size_bytes.
"""
page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
if len(page_sizes) <= 1:
# All layers have the same page size, no need to unify.
return kv_cache_spec
max_page_size = max(page_sizes)
new_kv_cache_spec = {}
for layer_name, layer_spec in kv_cache_spec.items():
if layer_spec.page_size_bytes == max_page_size:
new_kv_cache_spec[layer_name] = layer_spec
else:
layer_page_size = layer_spec.page_size_bytes
if max_page_size % layer_page_size != 0:
raise NotImplementedError(
"The page size of the layer is not divisible by the "
"maximum page size. Cannot unify by adjusting block_size."
)
ratio = max_page_size // layer_page_size
new_block_size = layer_spec.block_size * ratio
new_spec = replace(layer_spec, block_size=new_block_size)
assert new_spec.page_size_bytes == max_page_size
new_kv_cache_spec[layer_name] = new_spec
return new_kv_cache_spec
def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
# kv_cache_spec is an empty dict for attention free models
return not kv_cache_spec
@ -1010,7 +1050,6 @@ def _get_kv_cache_groups_uniform_page_size(
def get_kv_cache_config_from_groups(
vllm_config: VllmConfig,
kv_cache_groups: list[KVCacheGroupSpec],
kv_cache_specs: dict[str, KVCacheSpec],
available_memory: int,
) -> KVCacheConfig:
"""
@ -1020,7 +1059,6 @@ def get_kv_cache_config_from_groups(
Args:
vllm_config: The global VllmConfig
kv_cache_groups: The KV cache groups
kv_cache_specs: The KV cache spec of each attention layer in the model
available_memory: Memory available for KV cache in bytes
Returns:
The generated KVCacheConfig
@ -1064,7 +1102,9 @@ def get_kv_cache_config_from_groups(
# full.1, sw.2: share another Tensor with size=available_memory//2
group_size = max(len(group.layer_names) for group in kv_cache_groups)
page_size = get_uniform_page_size(kv_cache_specs)
page_size = get_uniform_page_size(
[group.kv_cache_spec for group in kv_cache_groups]
)
assert group_size > 0, "group_size must be greater than 0"
num_blocks = get_num_blocks(
vllm_config, group_size, available_memory, page_size
@ -1166,7 +1206,8 @@ def get_kv_cache_groups(
# This returns an empty list to allow for the KVCacheManager to handle
# attention free models.
return []
elif is_kv_cache_spec_uniform(kv_cache_spec):
if is_kv_cache_spec_uniform(kv_cache_spec):
# KV cache of all layers are the same, which is true for
# most models. Allocate the same amount of memory for
# each layer.
@ -1176,14 +1217,16 @@ def get_kv_cache_groups(
# full attention, or all layers are sliding window attention with the
# same window size). Put all layers into one group.
return _get_kv_cache_groups_uniform_type(uniform_spec)
elif is_kv_cache_page_size_uniform(kv_cache_spec):
# Model contains multiple attention types, but KV cache of all layers
# have the same physical memory per block per layer. Split the layers
# into groups with the same number of layers, and thus same total page
# size.
return _get_kv_cache_groups_uniform_page_size(kv_cache_spec)
raise NotImplementedError
# As KVCacheManager can only allocate memory of one size, we need to unify
# the page size of the layers. For cases cannot be unified, this function
# will raise an error.
kv_cache_spec = unify_kv_cache_spec_page_size(kv_cache_spec)
# Model contains multiple attention types, but KV cache of all layers
# have the same physical memory per block per layer. Split the layers
# into groups with the same number of layers, and thus same total page
# size.
return _get_kv_cache_groups_uniform_page_size(kv_cache_spec)
def generate_scheduler_kv_cache_config(
@ -1327,10 +1370,7 @@ def get_kv_cache_configs(
) == len(kv_cache_spec_one_worker), "Some layers are not assigned to any group."
kv_cache_configs.append(
get_kv_cache_config_from_groups(
vllm_config,
kv_cache_groups_one_worker,
kv_cache_spec_one_worker,
available_memory_one_worker,
vllm_config, kv_cache_groups_one_worker, available_memory_one_worker
)
)
@ -1353,3 +1393,79 @@ def get_kv_cache_configs(
_report_kv_cache_config(vllm_config, kv_cache_config)
return kv_cache_configs
class BlockHashListWithBlockSize:
    """
    Lazily re-chunk block hashes from `hash_block_size` granularity to
    `target_block_size` granularity.

    Needed when KV cache groups use different block sizes: the original
    `block_hashes` were computed with `hash_block_size` tokens per block,
    while a group may manage blocks of `target_block_size` tokens. Only
    integer scale-ups are supported (`target_block_size` must be a
    multiple of `hash_block_size`). Each coarse hash is produced on
    access by concatenating the consecutive fine-grained hashes it
    covers.

    Example (`hash_block_size` = 16, `target_block_size` = 32):
    concatenating two 16-size hashes yields one 32-size hash:

        Block hashes with block_size 16:
        | Token Range | 0-15 | 16-31 | 32-47 | 48-63 |
        |-------------|------|-------|-------|-------|
        | Hash        | A    | B     | C     | D     |

        Block hashes with block_size 32:
        | Token Range | 0-31 | 32-63 |
        |-------------|------|-------|
        | Hash        | AB   | CD    |

    Args:
        block_hashes: Block hashes to convert, computed at `hash_block_size`.
        hash_block_size: Block size at which `block_hashes` were computed.
        target_block_size: Desired block size; must be a multiple of
            `hash_block_size`.
    """

    def __init__(
        self,
        block_hashes: list[BlockHash],
        hash_block_size: int,
        target_block_size: int,
    ):
        self.block_hashes = block_hashes
        assert target_block_size % hash_block_size == 0
        # Number of fine-grained hashes folded into one coarse hash.
        self.scale_factor = target_block_size // hash_block_size

    def __len__(self) -> int:
        # Trailing hashes that do not fill a whole coarse block are dropped.
        return len(self.block_hashes) // self.scale_factor

    @overload
    def __getitem__(self, idx: int) -> BlockHash: ...

    @overload
    def __getitem__(self, idx: slice) -> list[BlockHash]: ...

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            bounds = idx.indices(len(self))
            return [self._get_value_at(i) for i in range(*bounds)]
        if isinstance(idx, int):
            return self._get_value_at(idx)
        raise TypeError(f"Invalid index type: {type(idx)!r}")

    def __iter__(self) -> Iterator[BlockHash]:
        return map(self._get_value_at, range(len(self)))

    def _get_value_at(self, idx: int) -> BlockHash:
        # Concatenate `scale_factor` consecutive fine-grained hashes into
        # one coarse hash. The running-concatenation form (rather than a
        # slice) keeps the original semantics for any int index accepted
        # by list indexing.
        first = idx * self.scale_factor
        merged: bytes = self.block_hashes[first]
        for offset in range(1, self.scale_factor):
            merged += self.block_hashes[first + offset]
        return BlockHash(merged)
BlockHashList = list[BlockHash] | BlockHashListWithBlockSize

View File

@ -186,6 +186,7 @@ class Scheduler(SchedulerInterface):
enable_kv_cache_events=self.enable_kv_cache_events,
dcp_world_size=self.dcp_world_size,
pcp_world_size=self.pcp_world_size,
hash_block_size=self.block_size,
)
sink_len = getattr(vllm_config.model_config.hf_config, "param_sink_number", 0)
if sink_len > 0:
@ -1093,8 +1094,6 @@ class Scheduler(SchedulerInterface):
and request.sampling_params.logprobs is not None
and logprobs
):
# NOTE: once we support N tokens per step (spec decode),
# the outer lists can be of length > 1.
new_logprobs = logprobs.slice(req_index, req_index + 1)
if new_token_ids and self.structured_output_manager.should_advance(request):

View File

@ -7,7 +7,7 @@ from collections.abc import Sequence
from vllm.utils.math_utils import cdiv
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
from vllm.v1.core.kv_cache_utils import BlockHashList, KVCacheBlock
from vllm.v1.kv_cache_interface import (
ChunkedLocalAttentionSpec,
CrossAttentionSpec,
@ -208,12 +208,13 @@ class SingleTypeKVCacheManager(ABC):
@abstractmethod
def find_longest_cache_hit(
cls,
block_hashes: list[BlockHash],
block_hashes: BlockHashList,
max_length: int,
kv_cache_group_ids: list[int],
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1,
pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
@ -233,6 +234,11 @@ class SingleTypeKVCacheManager(ABC):
block_pool: The block pool.
kv_cache_spec: The kv cache spec.
use_eagle: Whether to use eagle.
alignment_tokens: The returned cache hit length (in tokens) should
be a multiple of this value (in tokens). By default, it should
be set to the block_size.
dcp_world_size: The world size of decode context parallelism.
pcp_world_size: The world size of prefill context parallelism.
Returns:
A list of cached blocks with skipped blocks replaced by null block
@ -300,18 +306,19 @@ class FullAttentionManager(SingleTypeKVCacheManager):
@classmethod
def find_longest_cache_hit(
cls,
block_hashes: list[BlockHash],
block_hashes: BlockHashList,
max_length: int,
kv_cache_group_ids: list[int],
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1,
pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
assert isinstance(
kv_cache_spec,
(FullAttentionSpec, FullDiffkvAttentionSpec, ChunkedLocalAttentionSpec),
FullAttentionSpec | ChunkedLocalAttentionSpec | FullDiffkvAttentionSpec
), (
"FullAttentionManager can only be used for full attention "
"and chunked local attention groups"
@ -335,6 +342,13 @@ class FullAttentionManager(SingleTypeKVCacheManager):
else:
break
if use_eagle and computed_blocks[0]:
# Need to drop the last matched block if eagle is enabled.
for computed in computed_blocks:
computed.pop()
while (
block_size != alignment_tokens # Faster for common case.
and len(computed_blocks[0]) * block_size % alignment_tokens != 0
):
for computed in computed_blocks:
computed.pop()
return computed_blocks
@ -361,12 +375,13 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
@classmethod
def find_longest_cache_hit(
cls,
block_hashes: list[BlockHash],
block_hashes: BlockHashList,
max_length: int,
kv_cache_group_ids: list[int],
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1,
pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
@ -398,6 +413,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
[block_pool.null_block] * max_num_blocks
for _ in range(len(kv_cache_group_ids))
)
block_size = kv_cache_spec.block_size
num_contiguous_blocks = 0
match_found = False
# Search from right to left and early stop when a match is found.
@ -405,6 +421,15 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
if cached_block := block_pool.get_cached_block(
block_hashes[i], kv_cache_group_ids
):
# Skip prefix matching check if the block is not aligned with
# `alignment_tokens`.
if (
num_contiguous_blocks == 0
and block_size != alignment_tokens # Faster for common case.
and (i + 1) * block_size % alignment_tokens != 0
):
continue
# Add the cached block to the computed blocks.
for computed, cached in zip(computed_blocks, cached_block):
computed[i] = cached
num_contiguous_blocks += 1
@ -423,7 +448,16 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
# `num_contiguous_blocks < sliding_window_contiguous_blocks`.
for computed in computed_blocks:
del computed[num_contiguous_blocks:]
while (
block_size != alignment_tokens # Faster for common case.
and len(computed_blocks[0]) * block_size % alignment_tokens != 0
):
for computed in computed_blocks:
computed.pop()
if use_eagle and computed_blocks[0]:
assert kv_cache_spec.block_size == alignment_tokens, (
"aligned_length is not compatible with eagle now"
)
for computed in computed_blocks:
computed.pop()
return computed_blocks
@ -477,12 +511,13 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
@classmethod
def find_longest_cache_hit(
cls,
block_hashes: list[BlockHash],
block_hashes: BlockHashList,
max_length: int,
kv_cache_group_ids: list[int],
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1,
pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
@ -513,6 +548,10 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
block_pool: The block pool.
kv_cache_spec: The kv cache spec.
use_eagle: Whether to use eagle.
dcp_world_size: The world size of decode context parallelism.
pcp_world_size: The world size of prefill context parallelism.
alignment_tokens: The returned cache hit length (in tokens) should
be a multiple of this value (in tokens).
Returns:
A list of cached blocks
@ -526,6 +565,10 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
)
assert dcp_world_size == 1, "DCP not support chunked local attn now."
assert pcp_world_size == 1, "PCP not support chunked local attn now."
assert kv_cache_spec.block_size == alignment_tokens, (
"KV cache groups with different block sizes are not compatible with "
"chunked local attention now"
)
max_num_blocks = max_length // kv_cache_spec.block_size
if max_length > 0:
local_attention_start_idx = (
@ -614,12 +657,13 @@ class MambaManager(SingleTypeKVCacheManager):
@classmethod
def find_longest_cache_hit(
cls,
block_hashes: list[BlockHash],
block_hashes: BlockHashList,
max_length: int,
kv_cache_group_ids: list[int],
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1,
pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
@ -632,12 +676,21 @@ class MambaManager(SingleTypeKVCacheManager):
[] for _ in range(len(kv_cache_group_ids))
)
max_num_blocks = max_length // kv_cache_spec.block_size
block_size = kv_cache_spec.block_size
max_num_blocks = max_length // block_size
# Search from right to left and early stop when a match is found.
for i in range(max_num_blocks - 1, -1, -1):
if cached_block := block_pool.get_cached_block(
block_hashes[i], kv_cache_group_ids
):
# When enable Mamba prefix caching, `block_size` will be aligned
# across full attention layers and Mamba layers to ensure the
# prefix hit length aligned at block
if (
block_size != alignment_tokens # Faster for common case.
and (i + 1) * block_size % alignment_tokens != 0
):
continue
for computed, cached in zip(computed_blocks, cached_block):
# the hit length logic later assumes:
# hit_length = len(hit_blocks_other_attn[0])
@ -710,12 +763,13 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
@classmethod
def find_longest_cache_hit(
cls,
block_hashes: list[BlockHash],
block_hashes: BlockHashList,
max_length: int,
kv_cache_group_ids: list[int],
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1,
pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:

View File

@ -31,7 +31,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_
from vllm.usage.usage_lib import UsageContext
from vllm.utils.async_utils import cancel_task_threadsafe
from vllm.utils.collection_utils import as_list
from vllm.utils.func_utils import deprecate_kwargs
from vllm.utils.math_utils import cdiv
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient
@ -195,12 +194,6 @@ class AsyncLLM(EngineClient):
self.profiler = None
@classmethod
@deprecate_kwargs(
"disable_log_requests",
additional_message=(
"This argument will have no effect. Use `enable_log_requests` instead."
),
)
def from_vllm_config(
cls,
vllm_config: VllmConfig,
@ -213,7 +206,6 @@ class AsyncLLM(EngineClient):
client_addresses: dict[str, str] | None = None,
client_count: int = 1,
client_index: int = 0,
disable_log_requests: bool = True, # Deprecated, will be removed
) -> "AsyncLLM":
# Create the LLMEngine.
return cls(

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from dataclasses import replace
import torch
@ -204,7 +205,9 @@ class RejectionSampler(nn.Module):
def parse_output(
output_token_ids: torch.Tensor,
vocab_size: int,
) -> list[list[int]]:
discard_req_indices: Sequence[int] = (),
return_cu_num_tokens: bool = False,
) -> tuple[list[list[int]], list[int] | None]:
"""Parse the output of the rejection sampler.
Args:
output_token_ids: The sampled token IDs in shape
@ -212,6 +215,8 @@ class RejectionSampler(nn.Module):
replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler
and will be filtered out in this function.
vocab_size: The size of the vocabulary.
discard_req_indices: Optional row indices to discard tokens in.
return_cu_num_tokens: Whether to also return cumulative token counts.
Returns:
A list of lists of token IDs.
"""
@ -220,10 +225,15 @@ class RejectionSampler(nn.Module):
valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & (
output_token_ids_np < vocab_size
)
cu_num_tokens = None
if return_cu_num_tokens:
cu_num_tokens = [0] + valid_mask.sum(axis=1).cumsum().tolist()
if len(discard_req_indices) > 0:
valid_mask[discard_req_indices] = False
outputs = [
row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np)
]
return outputs
return outputs, cu_num_tokens
def apply_logits_processors(
self,

View File

@ -1055,11 +1055,11 @@ class EagleProposer:
elif (
isinstance(target_embed_tokens.weight, torch.Tensor)
and isinstance(self.model.model.embed_tokens.weight, torch.Tensor)
and torch.allclose(
# TODO: Offload to CPU for comparison to avoid extra GPU memory
# usage in CI testing environments with limited GPU memory
and torch.equal(
target_embed_tokens.weight.cpu(),
self.model.model.embed_tokens.weight.cpu(),
rtol=1e-5,
atol=1e-7,
)
):
share_embeddings = True
@ -1105,8 +1105,11 @@ class EagleProposer:
hasattr(target_language_model, "lm_head")
and isinstance(target_language_model.lm_head.weight, torch.Tensor)
and isinstance(self.model.lm_head.weight, torch.Tensor)
# TODO: Offload to CPU for comparison to avoid extra GPU memory
# usage in CI testing environments with limited GPU memory
and torch.equal(
target_language_model.lm_head.weight, self.model.lm_head.weight
target_language_model.lm_head.weight.cpu(),
self.model.lm_head.weight.cpu(),
)
):
share_lm_head = True

View File

@ -186,7 +186,7 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
self,
model_runner_output: ModelRunnerOutput,
sampled_token_ids: torch.Tensor,
logprobs_tensors: torch.Tensor | None,
logprobs_tensors: LogprobsTensors | None,
invalid_req_indices: list[int],
async_output_copy_stream: torch.cuda.Stream,
vocab_size: int,
@ -222,28 +222,29 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
This function blocks until the copy is finished.
"""
max_gen_len = self.sampled_token_ids_cpu.shape[-1]
self.async_copy_ready_event.synchronize()
# Release the device tensors once the copy has completed.
del self._logprobs_tensors
del self._sampled_token_ids
max_gen_len = self.sampled_token_ids_cpu.shape[-1]
if max_gen_len == 1:
valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist()
for i in self._invalid_req_indices:
valid_sampled_token_ids[i].clear()
cu_num_tokens = None
else:
valid_sampled_token_ids = RejectionSampler.parse_output(
valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
self.sampled_token_ids_cpu,
self.vocab_size,
self._invalid_req_indices,
return_cu_num_tokens=self._logprobs_tensors_cpu is not None,
)
for i in self._invalid_req_indices:
valid_sampled_token_ids[i].clear()
output = self._model_runner_output
output.sampled_token_ids = valid_sampled_token_ids
if self._logprobs_tensors_cpu:
# NOTE(nick): this will need to be updated to use cu_num_accepted_tokens
# for async sched + spec decode + logprobs compatibility.
output.logprobs = self._logprobs_tensors_cpu.tolists()
output.logprobs = self._logprobs_tensors_cpu.tolists(cu_num_tokens)
return output
@ -2629,28 +2630,24 @@ class GPUModelRunner(
sampled_token_ids = sampler_output.sampled_token_ids
logprobs_tensors = sampler_output.logprobs_tensors
invalid_req_indices = []
cu_num_new_tokens: list[int] | None = None
cu_num_tokens: list[int] | None = None
if not self.use_async_scheduling:
# Get the valid generated tokens.
max_gen_len = sampled_token_ids.shape[-1]
if max_gen_len == 1:
# No spec decode tokens.
valid_sampled_token_ids = self._to_list(sampled_token_ids)
# Mask out the sampled tokens that should not be sampled.
for i in discard_sampled_tokens_req_indices:
valid_sampled_token_ids[int(i)].clear()
else:
# Includes spec decode tokens.
valid_sampled_token_ids = self.rejection_sampler.parse_output(
valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
sampled_token_ids,
self.input_batch.vocab_size,
discard_sampled_tokens_req_indices,
return_cu_num_tokens=logprobs_tensors is not None,
)
if logprobs_tensors:
# Needed for extracting logprobs when spec decoding.
# This must be done prior to discarding sampled tokens.
cu_num_new_tokens = [0]
for toks in valid_sampled_token_ids:
cu_num_new_tokens.append(cu_num_new_tokens[-1] + len(toks))
# Mask out the sampled tokens that should not be sampled.
for i in discard_sampled_tokens_req_indices:
valid_sampled_token_ids[int(i)].clear()
else:
valid_sampled_token_ids = []
invalid_req_indices = discard_sampled_tokens_req_indices.tolist()
@ -2704,7 +2701,7 @@ class GPUModelRunner(
req_state.output_token_ids.extend(sampled_ids)
logprobs_lists = (
logprobs_tensors.tolists(cu_num_new_tokens)
logprobs_tensors.tolists(cu_num_tokens)
if not self.use_async_scheduling and logprobs_tensors is not None
else None
)
@ -2824,7 +2821,7 @@ class GPUModelRunner(
# returns True. before returning early here we call
# dummy run to ensure coordinate_batch_across_dp
# is called into to avoid out of sync issues.
self._dummy_run(1)
self._dummy_run(self._get_num_input_tokens(1))
if not has_kv_transfer_group():
# Return empty ModelRunnerOutput if no work to do.
return EMPTY_MODEL_RUNNER_OUTPUT
@ -3495,6 +3492,10 @@ class GPUModelRunner(
scope="local",
)
prepare_communication_buffer_for_model(self.model)
if (drafter := getattr(self, "drafter", None)) and (
drafter_model := getattr(drafter, "model", None)
):
prepare_communication_buffer_for_model(drafter_model)
mm_config = self.model_config.multimodal_config
self.is_multimodal_pruning_enabled = (
supports_multimodal_pruning(self.get_model())
@ -4277,14 +4278,18 @@ class GPUModelRunner(
# NOTE: This happens when encoder cache needs to store
# the embeddings that encoder outputs are scattered onto.
# In this case we create dummy embeddings of size
# (encode_budget, hidden_size) and scatter encoder
# output into it.
# (max_tokens_for_modality, hidden_size) and scatter
# encoder output into it.
encoder_output_shape = dummy_encoder_outputs[0].shape
if encoder_output_shape[0] < encoder_budget:
max_mm_tokens_per_item = mm_budget.max_tokens_by_modality[
dummy_modality
]
if encoder_output_shape[0] < max_mm_tokens_per_item:
encoder_hidden_size = encoder_output_shape[-1]
expanded_outputs = []
for output in dummy_encoder_outputs:
expanded = output.new_zeros(
(encoder_budget, encoder_output_shape[-1])
(max_mm_tokens_per_item, encoder_hidden_size)
)
num_tokens = output.shape[0]
expanded[:num_tokens].copy_(output)