Merge branch 'main' into Add_support_for_openpangu_promoe_v2

Signed-off-by: yt0428 <51468697+yt0428@users.noreply.github.com>
This commit is contained in:
yt0428 2025-11-26 11:41:50 +08:00 committed by GitHub
commit 28169a6fce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
77 changed files with 1685 additions and 441 deletions

View File

@ -903,11 +903,12 @@ steps:
- label: Transformers Nightly Models Test - label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
optional: true optional: true
soft_fail: true
commands: commands:
- pip install --upgrade git+https://github.com/huggingface/transformers - pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl

View File

@ -13,7 +13,7 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Set up Python - name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0

View File

@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30 timeout-minutes: 30
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v6
- uses: astral-sh/setup-uv@v7 - uses: astral-sh/setup-uv@v7
with: with:

View File

@ -16,7 +16,7 @@ jobs:
pre-commit: pre-commit:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with: with:
python-version: "3.12" python-version: "3.12"

View File

@ -604,12 +604,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") "csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}") CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1") list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else() else()
message(STATUS "Not building NVFP4 as no compatible archs were found.") message(STATUS "Not building NVFP4 as no compatible archs were found.")

View File

@ -22,6 +22,7 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h> #include <c10/cuda/CUDAStream.h>
#include "cutlass_extensions/common.hpp"
#include "cute/tensor.hpp" #include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h" #include "cutlass/tensor_ref.h"
@ -173,7 +174,7 @@ void run_get_group_gemm_starts(
} }
template <typename OutType> template <typename OutType>
void run_fp4_blockwise_scaled_group_mm( void run_fp4_blockwise_scaled_group_mm_sm100(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes, const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
@ -343,17 +344,225 @@ void run_fp4_blockwise_scaled_group_mm(
auto can_implement_status = gemm_op.can_implement(args); auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM"); "Failed to implement GEMM: status=", (int)can_implement_status);
// Run the GEMM // Run the GEMM
auto status = gemm_op.initialize(args, workspace.data_ptr()); auto status = gemm_op.initialize(args, workspace.data_ptr());
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM"); TORCH_CHECK(status == cutlass::Status::kSuccess,
"Failed to initialize GEMM: status=", (int)status,
" workspace_size=", workspace_size, " num_experts=", num_experts,
" M=", M, " N=", N, " K=", K);
status = gemm_op.run(args, workspace.data_ptr(), stream); status = gemm_op.run(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
} }
// Grouped (one GEMM problem per expert) block-scaled NVFP4 matmul specialized
// for SM120 (compute capability 12.x). Counterpart of the templated SM100
// path, but built against cutlass::arch::Sm120 and with the output element
// type fixed to bf16 (see NOTE below).
//
// output          : result tensor, written per expert group (bf16).
// a, b            : packed FP4 (e2m1) operands; b is column-major per expert.
// a_blockscale,
// b_blockscales   : ue4m3 block scale factors for a and b.
// alphas          : per-expert output scales (beta is fixed to 0 below).
// problem_sizes   : per-expert (M, N, K) triples, consumed directly as the
//                   CUTLASS group problem shapes.
// expert_offsets,
// sf_offsets      : per-expert row / scale-factor offsets consumed by
//                   run_get_group_gemm_starts to derive group base pointers.
// M, N, K         : overall dimensions (pointer setup and error messages).
void run_fp4_blockwise_scaled_group_mm_sm120(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
    const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
    const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
    const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
    int N, int K) {
  using ProblemShape =
      cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
  using ElementType = cutlass::float_e2m1_t;
  using ElementSFType = cutlass::float_ue4m3_t;
  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;

  // NOTE: For SM120 it seems templating the output type is not supported and
  // we need to hardcode the output type to bfloat16
  using ElementC = cutlass::bfloat16_t;
  using ElementD = ElementC;
  using ElementAccumulator = float;

  // Layout definitions
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = cutlass::layout::RowMajor;
  using LayoutD = LayoutC;

  // Alignment constraints (in elements). A/B are 4-bit, so 32 elements equal
  // 128 bits; C/D alignments are likewise derived as 128 bits / element width.
  static constexpr int AlignmentA = 32;
  static constexpr int AlignmentB = 32;
  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Architecture definitions
  using ArchTag = cutlass::arch::Sm120;
  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
  using ClusterShape = Shape<_1, _1, _1>;
  using MmaTileShape = Shape<_128, _128, _128>;
  using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
      ElementD, ElementAccumulator, ElementC, ElementAccumulator>;
  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag, OperatorClass, MmaTileShape, ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
          ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
          LayoutD*, AlignmentD,
          cutlass::epilogue::collective::EpilogueScheduleAuto,
          FusionOperation>::CollectiveOp;
  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
          LayoutB*, AlignmentB, ElementAccumulator, MmaTileShape, ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
  using GemmKernel =
      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
                                           CollectiveEpilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
  using LayoutSFA =
      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
  using LayoutSFB =
      typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
  using ScaleConfig =
      typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;

  int num_experts = static_cast<int>(expert_offsets.size(0));
  // Per-expert pointer / stride / layout arrays, stored as int64 device
  // tensors and reinterpreted below when building the kernel arguments.
  auto options_int =
      torch::TensorOptions().dtype(torch::kInt64).device(a.device());
  torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
  torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
  // 5 int64 slots per expert to hold the CUTLASS scale-factor layout objects
  // (populated by run_get_group_gemm_starts).
  torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
  torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
  torch::Tensor c_strides1 =
      torch::full({num_experts}, output.stride(0), options_int);
  // NOTE(review): a/b strides are doubled — presumably because two e2m1
  // values are packed per byte in the torch tensor, so the FP4 element
  // stride is twice the packed stride. Confirm against the SM100 path.
  torch::Tensor a_strides1 =
      torch::full({num_experts}, a.stride(0) * 2, options_int);
  torch::Tensor b_strides1 =
      torch::full({num_experts}, b.stride(1) * 2, options_int);
  run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
      a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
      layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
      expert_offsets, sf_offsets, problem_sizes, M, N, K);

  // Create an instance of the GEMM
  Gemm gemm_op;

  // Initialize problem_sizes_as_shapes correctly
  UnderlyingProblemShape* problem_sizes_as_shapes =
      static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());

  // Set the Scheduler info
  cutlass::KernelHardwareInfo hw_info;
  using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
  typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
  scheduler.raster_order = RasterOrderOptions::AlongM;
  hw_info.device_id = a.get_device();
  // Cache the SM count per device to avoid repeated driver queries.
  // NOTE(review): this function-local static map is not synchronized —
  // assumes calls are serialized (single engine thread). Confirm before
  // invoking from multiple threads.
  static std::unordered_map<int, int> cached_sm_counts;
  if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) {
    cached_sm_counts[hw_info.device_id] =
        cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
            hw_info.device_id);
  }
  // min(..., INT_MAX) is effectively a no-op clamp kept from the SM100 code.
  hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX);

  // Mainloop Arguments
  typename GemmKernel::MainloopArguments mainloop_args{
      static_cast<const ElementType**>(a_ptrs.data_ptr()),
      static_cast<StrideA*>(a_strides1.data_ptr()),
      static_cast<const ElementType**>(b_ptrs.data_ptr()),
      static_cast<StrideB*>(b_strides1.data_ptr()),
      static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
      reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
      static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
      reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};

  // Epilogue Arguments. The C operand pointer array is null because beta is
  // set to 0 below (output = alpha * accumulator only).
  typename GemmKernel::EpilogueArguments epilogue_args{
      {},  // epilogue.thread
      nullptr,
      static_cast<StrideC*>(c_strides1.data_ptr()),
      static_cast<ElementD**>(out_ptrs.data_ptr()),
      static_cast<StrideC*>(c_strides1.data_ptr())};
  auto& fusion_args = epilogue_args.thread;
  fusion_args.alpha_ptr_array =
      reinterpret_cast<float**>(alpha_ptrs.data_ptr());
  // Zero strides over M/N, stride 1 over the group index: one alpha per
  // expert group.
  fusion_args.dAlpha = {_0{}, _0{}, 1};
  fusion_args.beta = 0.0f;

  // Gemm Arguments
  typename GemmKernel::Arguments args{
      cutlass::gemm::GemmUniversalMode::kGrouped,
      {num_experts, problem_sizes_as_shapes, nullptr},
      mainloop_args,
      epilogue_args,
      hw_info,
      scheduler};

  size_t workspace_size = Gemm::get_workspace_size(args);
  auto const workspace_options =
      torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
  auto workspace = torch::empty(workspace_size, workspace_options);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());

  auto can_implement_status = gemm_op.can_implement(args);
  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
              "Failed to implement GEMM: status=", (int)can_implement_status);

  // Run the GEMM
  auto status = gemm_op.initialize(args, workspace.data_ptr());
  TORCH_CHECK(status == cutlass::Status::kSuccess,
              "Failed to initialize GEMM: status=", (int)status,
              " workspace_size=", workspace_size, " num_experts=", num_experts,
              " M=", M, " N=", N, " K=", K);
  status = gemm_op.run(args, workspace.data_ptr(), stream);
  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
template <typename OutType>
void run_fp4_blockwise_scaled_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
int N, int K) {
int32_t version_num = get_sm_version_num();
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
if (version_num >= 120 && version_num < 130) {
run_fp4_blockwise_scaled_group_mm_sm120(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
return;
}
#endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
if (version_num >= 100 && version_num < 120) {
run_fp4_blockwise_scaled_group_mm_sm100<OutType>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
return;
}
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled cutlass_fp4_group_mm kernel for CUDA device capability: ",
version_num, ". Required capability: 100 or 120");
}
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte; constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn; constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
#endif #endif
@ -374,7 +583,8 @@ void cutlass_fp4_group_mm(
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes, const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) { const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
// Input validation // Input validation
CHECK_INPUT(a, FLOAT4_E2M1X2, "a"); CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
CHECK_INPUT(b, FLOAT4_E2M1X2, "b"); CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
@ -408,6 +618,14 @@ void cutlass_fp4_group_mm(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes, output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K); expert_offsets, sf_offsets, M, N, K);
} else { } else {
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
int32_t version_num = get_sm_version_num();
if (version_num >= 120 && version_num < 130) {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "SM120 NVFP4 MOE only supports bfloat16 output, got: ",
output.scalar_type());
}
#endif
run_fp4_blockwise_scaled_group_mm<cutlass::half_t>( run_fp4_blockwise_scaled_group_mm<cutlass::half_t>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes, output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K); expert_offsets, sf_offsets, M, N, K);
@ -416,8 +634,8 @@ void cutlass_fp4_group_mm(
TORCH_CHECK_NOT_IMPLEMENTED( TORCH_CHECK_NOT_IMPLEMENTED(
false, false,
"No compiled cutlass_fp4_group_mm kernel, vLLM must " "No compiled cutlass_fp4_group_mm kernel, vLLM must "
"be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA " "be compiled with ENABLE_NVFP4_SM100 or ENABLE_NVFP4_SM120 for SM100/120 "
"12.8 or above."); "and CUDA 12.8 or above.");
#endif #endif
} }

View File

@ -307,7 +307,7 @@ constexpr auto FLOAT = at::ScalarType::Float;
constexpr auto INT = at::ScalarType::Int; constexpr auto INT = at::ScalarType::Int;
constexpr auto UINT8 = at::ScalarType::Byte; constexpr auto UINT8 = at::ScalarType::Byte;
void scaled_fp4_experts_quant_sm100a( void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts, torch::Tensor const& input_offset_by_experts,

View File

@ -24,8 +24,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
torch::Tensor const& input_sf); torch::Tensor const& input_sf);
#endif #endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
void scaled_fp4_experts_quant_sm100a( (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts, torch::Tensor const& input_offset_by_experts,
@ -54,8 +55,9 @@ void scaled_fp4_experts_quant(
torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts, torch::Tensor const& input_offset_by_experts,
torch::Tensor const& output_scale_offset_by_experts) { torch::Tensor const& output_scale_offset_by_experts) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
return scaled_fp4_experts_quant_sm100a( (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
return scaled_fp4_experts_quant_sm1xxa(
output, output_scale, input, input_global_scale, input_offset_by_experts, output, output_scale, input, input_global_scale, input_offset_by_experts,
output_scale_offset_by_experts); output_scale_offset_by_experts);
#endif #endif

View File

@ -67,9 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
std::optional<torch::Tensor> const& bias); std::optional<torch::Tensor> const& bias);
#endif #endif
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \ #if (defined(ENABLE_CUTLASS_MOE_SM90) && ENABLE_CUTLASS_MOE_SM90) || \
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \ (defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100) || \
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120 (defined(ENABLE_CUTLASS_MOE_SM120) && ENABLE_CUTLASS_MOE_SM120)
void get_cutlass_moe_mm_data_caller( void get_cutlass_moe_mm_data_caller(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@ -284,8 +284,9 @@ void get_cutlass_moe_mm_data(
// This function currently gets compiled only if we have a valid cutlass moe // This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for. // mm to run it for.
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1, get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
problem_sizes2, input_permutation, problem_sizes2, input_permutation,
output_permutation, num_experts, n, k, output_permutation, num_experts, n, k,
@ -296,7 +297,7 @@ void get_cutlass_moe_mm_data(
false, false,
"No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for " "No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
"CUDA device capability: ", "CUDA device capability: ",
version_num, ". Required capability: 90 or 100"); version_num, ". Required capability: 90, 100, or 120");
} }
void get_cutlass_moe_mm_problem_sizes( void get_cutlass_moe_mm_problem_sizes(
@ -304,8 +305,9 @@ void get_cutlass_moe_mm_problem_sizes(
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) { const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
problem_sizes2, num_experts, n, k, problem_sizes2, num_experts, n, k,
blockscale_offsets); blockscale_offsets);
@ -315,7 +317,7 @@ void get_cutlass_moe_mm_problem_sizes(
false, false,
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm " "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
"kernel for CUDA device capability: ", "kernel for CUDA device capability: ",
version_num, ". Required capability: 90 or 100"); version_num, ". Required capability: 90, 100, or 120");
} }
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
@ -328,8 +330,9 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
// This function currently gets compiled only if we have a valid cutlass moe // This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for. // mm to run it for.
int32_t version_num = get_sm_version_num(); int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1, get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens, problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k); num_local_experts, padded_m, n, k);
@ -339,7 +342,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
false, false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel " "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ", "for CUDA device capability: ",
version_num, ". Required capability: 90 or 100"); version_num, ". Required capability: 90, 100, or 120");
} }
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,

View File

@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod] - [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod] - [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod] - [`CompressedTensorsW4A4Nvfp4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4Nvfp4MoeMethod]
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod] - [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod] - [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod] - [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]

View File

@ -133,7 +133,7 @@ def main(args):
tensor_parallel_size=args.tp, tensor_parallel_size=args.tp,
enable_chunked_prefill=args.enable_chunked_prefill, enable_chunked_prefill=args.enable_chunked_prefill,
enforce_eager=args.enforce_eager, enforce_eager=args.enforce_eager,
gpu_memory_utilization=0.8, gpu_memory_utilization=0.9,
speculative_config=speculative_config, speculative_config=speculative_config,
disable_log_stats=False, disable_log_stats=False,
max_model_len=args.max_model_len, max_model_len=args.max_model_len,

View File

@ -55,7 +55,7 @@ class SillyModel(nn.Module):
def _run_simple_model( def _run_simple_model(
splitting_ops, splitting_ops,
use_inductor_graph_partition, use_inductor_graph_partition,
use_inductor, backend,
expected_num_piecewise_graphs_seen, expected_num_piecewise_graphs_seen,
expected_num_piecewise_capturable_graphs_seen, expected_num_piecewise_capturable_graphs_seen,
expected_num_backend_compilations, expected_num_backend_compilations,
@ -64,7 +64,7 @@ def _run_simple_model(
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
use_inductor=use_inductor, backend=backend,
splitting_ops=splitting_ops, splitting_ops=splitting_ops,
use_inductor_graph_partition=use_inductor_graph_partition, use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_copy_inputs=True, cudagraph_copy_inputs=True,
@ -124,14 +124,14 @@ def _run_simple_model(
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0])) assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
@pytest.mark.parametrize("use_inductor", [True, False]) @pytest.mark.parametrize("backend", ["inductor", "eager"])
@torch.inference_mode() @torch.inference_mode()
@create_new_process_for_each_test("spawn") @create_new_process_for_each_test("spawn")
def test_simple_piecewise_compile(use_inductor): def test_simple_piecewise_compile(backend):
_run_simple_model( _run_simple_model(
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
use_inductor_graph_partition=False, use_inductor_graph_partition=False,
use_inductor=use_inductor, backend=backend,
# 2 * num_layers + 1 # 2 * num_layers + 1
expected_num_piecewise_graphs_seen=5, expected_num_piecewise_graphs_seen=5,
# 1 + num_layers # 1 + num_layers
@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch):
_run_simple_model( _run_simple_model(
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
use_inductor=True, backend="inductor",
# Since not splitting at fx graph level # Since not splitting at fx graph level
expected_num_piecewise_graphs_seen=1, expected_num_piecewise_graphs_seen=1,
# Since not splitting at fx graph level # Since not splitting at fx graph level

View File

@ -249,14 +249,13 @@ def test_compilation_config():
args = parser.parse_args( args = parser.parse_args(
[ [
"-O", "-O",
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
'"use_inductor": false}',
] ]
) )
assert ( assert (
args.compilation_config.mode == 3 args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and not args.compilation_config.use_inductor and args.compilation_config.backend == "eager"
) )
# set to string form of a dict # set to string form of a dict
@ -264,13 +263,13 @@ def test_compilation_config():
[ [
"--compilation-config=" "--compilation-config="
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": true}', '"backend": "inductor"}',
] ]
) )
assert ( assert (
args.compilation_config.mode == 3 args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and args.compilation_config.use_inductor and args.compilation_config.backend == "inductor"
) )
@ -278,8 +277,9 @@ def test_prefix_cache_default():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([]) args = parser.parse_args([])
# should be None by default (depends on model).
engine_args = EngineArgs.from_cli_args(args=args) engine_args = EngineArgs.from_cli_args(args=args)
assert engine_args.enable_prefix_caching, "prefix caching should default to on." assert engine_args.enable_prefix_caching is None
# with flag to turn it on. # with flag to turn it on.
args = parser.parse_args(["--enable-prefix-caching"]) args = parser.parse_args(["--enable-prefix-caching"])

View File

@ -2,6 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
from openai.types.responses.response_reasoning_item import ( from openai.types.responses.response_reasoning_item import (
Content, Content,
ResponseReasoningItem, ResponseReasoningItem,
@ -76,6 +79,18 @@ class TestResponsesUtils:
== 'Hmm, the user has just started with a simple "Hello,"' == 'Hmm, the user has just started with a simple "Hello,"'
) )
tool_call_output = ResponseFunctionToolCallOutputItem(
id="temp_id",
type="function_call_output",
call_id="temp",
output="1234",
status="completed",
)
formatted_item = construct_chat_message_with_tool_call(tool_call_output)
assert formatted_item["role"] == "tool"
assert formatted_item["content"] == "1234"
assert formatted_item["tool_call_id"] == "temp"
item = ResponseReasoningItem( item = ResponseReasoningItem(
id="lol", id="lol",
summary=[], summary=[],

View File

@ -0,0 +1,240 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from vllm.config import ModelConfig
from vllm.config.load import LoadConfig
from vllm.model_executor.model_loader.gguf_loader import GGUFModelLoader
from vllm.model_executor.model_loader.weight_utils import download_gguf
class TestGGUFDownload:
"""Test GGUF model downloading functionality."""
@patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
def test_download_gguf_single_file(self, mock_download):
"""Test downloading a single GGUF file."""
# Setup mock
mock_folder = "/tmp/mock_cache"
mock_download.return_value = mock_folder
# Mock glob to return a single file
with patch("glob.glob") as mock_glob:
mock_glob.side_effect = lambda pattern, **kwargs: (
[f"{mock_folder}/model-IQ1_S.gguf"] if "IQ1_S" in pattern else []
)
result = download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S")
# Verify download_weights_from_hf was called with correct patterns
mock_download.assert_called_once_with(
model_name_or_path="unsloth/Qwen3-0.6B-GGUF",
cache_dir=None,
allow_patterns=[
"*-IQ1_S.gguf",
"*-IQ1_S-*.gguf",
"*/*-IQ1_S.gguf",
"*/*-IQ1_S-*.gguf",
],
revision=None,
ignore_patterns=None,
)
# Verify result is the file path, not folder
assert result == f"{mock_folder}/model-IQ1_S.gguf"
@patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
def test_download_gguf_sharded_files(self, mock_download):
"""Test downloading sharded GGUF files."""
mock_folder = "/tmp/mock_cache"
mock_download.return_value = mock_folder
# Mock glob to return sharded files
with patch("glob.glob") as mock_glob:
mock_glob.side_effect = lambda pattern, **kwargs: (
[
f"{mock_folder}/model-Q2_K-00001-of-00002.gguf",
f"{mock_folder}/model-Q2_K-00002-of-00002.gguf",
]
if "Q2_K" in pattern
else []
)
result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K")
# Should return the first file after sorting
assert result == f"{mock_folder}/model-Q2_K-00001-of-00002.gguf"
@patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
def test_download_gguf_subdir(self, mock_download):
"""Test downloading GGUF files from subdirectory."""
mock_folder = "/tmp/mock_cache"
mock_download.return_value = mock_folder
with patch("glob.glob") as mock_glob:
mock_glob.side_effect = lambda pattern, **kwargs: (
[f"{mock_folder}/Q2_K/model-Q2_K.gguf"]
if "Q2_K" in pattern or "**/*.gguf" in pattern
else []
)
result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K")
assert result == f"{mock_folder}/Q2_K/model-Q2_K.gguf"
@patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf")
@patch("glob.glob", return_value=[])
def test_download_gguf_no_files_found(self, mock_glob, mock_download):
"""Test error when no GGUF files are found."""
mock_folder = "/tmp/mock_cache"
mock_download.return_value = mock_folder
with pytest.raises(ValueError, match="Downloaded GGUF files not found"):
download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S")
class TestGGUFModelLoader:
"""Test GGUFModelLoader class methods."""
@patch("os.path.isfile", return_value=True)
def test_prepare_weights_local_file(self, mock_isfile):
"""Test _prepare_weights with local file."""
load_config = LoadConfig(load_format="gguf")
loader = GGUFModelLoader(load_config)
# Create a simple mock ModelConfig with only the model attribute
model_config = MagicMock()
model_config.model = "/path/to/model.gguf"
result = loader._prepare_weights(model_config)
assert result == "/path/to/model.gguf"
mock_isfile.assert_called_once_with("/path/to/model.gguf")
@patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download")
@patch("os.path.isfile", return_value=False)
def test_prepare_weights_https_url(self, mock_isfile, mock_hf_download):
"""Test _prepare_weights with HTTPS URL."""
load_config = LoadConfig(load_format="gguf")
loader = GGUFModelLoader(load_config)
mock_hf_download.return_value = "/downloaded/model.gguf"
# Create a simple mock ModelConfig with only the model attribute
model_config = MagicMock()
model_config.model = "https://huggingface.co/model.gguf"
result = loader._prepare_weights(model_config)
assert result == "/downloaded/model.gguf"
mock_hf_download.assert_called_once_with(
url="https://huggingface.co/model.gguf"
)
@patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download")
@patch("os.path.isfile", return_value=False)
def test_prepare_weights_repo_filename(self, mock_isfile, mock_hf_download):
"""Test _prepare_weights with repo_id/filename.gguf format."""
load_config = LoadConfig(load_format="gguf")
loader = GGUFModelLoader(load_config)
mock_hf_download.return_value = "/downloaded/model.gguf"
# Create a simple mock ModelConfig with only the model attribute
model_config = MagicMock()
model_config.model = "unsloth/Qwen3-0.6B-GGUF/model.gguf"
result = loader._prepare_weights(model_config)
assert result == "/downloaded/model.gguf"
mock_hf_download.assert_called_once_with(
repo_id="unsloth/Qwen3-0.6B-GGUF", filename="model.gguf"
)
@patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
@patch("vllm.transformers_utils.config.file_or_path_exists", return_value=True)
@patch("vllm.config.model.get_config")
@patch("vllm.config.model.is_gguf", return_value=True)
@patch("vllm.model_executor.model_loader.gguf_loader.download_gguf")
@patch("os.path.isfile", return_value=False)
def test_prepare_weights_repo_quant_type(
self,
mock_isfile,
mock_download_gguf,
mock_is_gguf,
mock_get_config,
mock_file_exists,
mock_get_image_config,
):
"""Test _prepare_weights with repo_id:quant_type format."""
mock_hf_config = MagicMock()
mock_hf_config.architectures = ["Qwen3ForCausalLM"]
class MockTextConfig:
max_position_embeddings = 4096
sliding_window = None
model_type = "qwen3"
num_attention_heads = 32
mock_text_config = MockTextConfig()
mock_hf_config.get_text_config.return_value = mock_text_config
mock_hf_config.dtype = "bfloat16"
mock_get_config.return_value = mock_hf_config
load_config = LoadConfig(load_format="gguf")
loader = GGUFModelLoader(load_config)
mock_download_gguf.return_value = "/downloaded/model-IQ1_S.gguf"
model_config = ModelConfig(
model="unsloth/Qwen3-0.6B-GGUF:IQ1_S", tokenizer="Qwen/Qwen3-0.6B"
)
result = loader._prepare_weights(model_config)
# The actual result will be the downloaded file path from mock
assert result == "/downloaded/model-IQ1_S.gguf"
mock_download_gguf.assert_called_once_with(
"unsloth/Qwen3-0.6B-GGUF",
"IQ1_S",
cache_dir=None,
revision=None,
ignore_patterns=["original/**/*"],
)
@patch("vllm.config.model.get_hf_image_processor_config", return_value=None)
@patch("vllm.config.model.get_config")
@patch("vllm.config.model.is_gguf", return_value=False)
@patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
@patch("os.path.isfile", return_value=False)
def test_prepare_weights_invalid_format(
self,
mock_isfile,
mock_check_gguf,
mock_is_gguf,
mock_get_config,
mock_get_image_config,
):
"""Test _prepare_weights with invalid format."""
mock_hf_config = MagicMock()
mock_hf_config.architectures = ["Qwen3ForCausalLM"]
class MockTextConfig:
max_position_embeddings = 4096
sliding_window = None
model_type = "qwen3"
num_attention_heads = 32
mock_text_config = MockTextConfig()
mock_hf_config.get_text_config.return_value = mock_text_config
mock_hf_config.dtype = "bfloat16"
mock_get_config.return_value = mock_hf_config
load_config = LoadConfig(load_format="gguf")
loader = GGUFModelLoader(load_config)
# Create ModelConfig with a valid repo_id to avoid validation errors
# Then test _prepare_weights with invalid format
model_config = ModelConfig(model="unsloth/Qwen3-0.6B")
# Manually set model to invalid format after creation
model_config.model = "invalid-format"
with pytest.raises(ValueError, match="Unrecognised GGUF reference"):
loader._prepare_weights(model_config)

View File

@ -1,11 +1,17 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from unittest.mock import patch
import pytest
from vllm.transformers_utils.utils import ( from vllm.transformers_utils.utils import (
is_cloud_storage, is_cloud_storage,
is_gcs, is_gcs,
is_gguf,
is_remote_gguf,
is_s3, is_s3,
split_remote_gguf,
) )
@ -28,3 +34,143 @@ def test_is_cloud_storage():
assert is_cloud_storage("s3://model-path/path-to-model") assert is_cloud_storage("s3://model-path/path-to-model")
assert not is_cloud_storage("/unix/local/path") assert not is_cloud_storage("/unix/local/path")
assert not is_cloud_storage("nfs://nfs-fqdn.local") assert not is_cloud_storage("nfs://nfs-fqdn.local")
class TestIsRemoteGGUF:
"""Test is_remote_gguf utility function."""
def test_is_remote_gguf_with_colon_and_slash(self):
"""Test is_remote_gguf with repo_id:quant_type format."""
# Valid quant types
assert is_remote_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S")
assert is_remote_gguf("user/repo:Q2_K")
assert is_remote_gguf("repo/model:Q4_K")
assert is_remote_gguf("repo/model:Q8_0")
# Invalid quant types should return False
assert not is_remote_gguf("repo/model:quant")
assert not is_remote_gguf("repo/model:INVALID")
assert not is_remote_gguf("repo/model:invalid_type")
def test_is_remote_gguf_without_colon(self):
"""Test is_remote_gguf without colon."""
assert not is_remote_gguf("repo/model")
assert not is_remote_gguf("unsloth/Qwen3-0.6B-GGUF")
def test_is_remote_gguf_without_slash(self):
"""Test is_remote_gguf without slash."""
assert not is_remote_gguf("model.gguf")
# Even with valid quant_type, no slash means not remote GGUF
assert not is_remote_gguf("model:IQ1_S")
assert not is_remote_gguf("model:quant")
def test_is_remote_gguf_local_path(self):
"""Test is_remote_gguf with local file path."""
assert not is_remote_gguf("/path/to/model.gguf")
assert not is_remote_gguf("./model.gguf")
def test_is_remote_gguf_with_path_object(self):
"""Test is_remote_gguf with Path object."""
assert is_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S"))
assert not is_remote_gguf(Path("repo/model"))
def test_is_remote_gguf_with_http_https(self):
"""Test is_remote_gguf with HTTP/HTTPS URLs."""
# HTTP/HTTPS URLs should return False even with valid quant_type
assert not is_remote_gguf("http://example.com/repo/model:IQ1_S")
assert not is_remote_gguf("https://huggingface.co/repo/model:Q2_K")
assert not is_remote_gguf("http://repo/model:Q4_K")
assert not is_remote_gguf("https://repo/model:Q8_0")
def test_is_remote_gguf_with_cloud_storage(self):
"""Test is_remote_gguf with cloud storage paths."""
# Cloud storage paths should return False even with valid quant_type
assert not is_remote_gguf("s3://bucket/repo/model:IQ1_S")
assert not is_remote_gguf("gs://bucket/repo/model:Q2_K")
assert not is_remote_gguf("s3://repo/model:Q4_K")
assert not is_remote_gguf("gs://repo/model:Q8_0")
class TestSplitRemoteGGUF:
"""Test split_remote_gguf utility function."""
def test_split_remote_gguf_valid(self):
"""Test split_remote_gguf with valid repo_id:quant_type format."""
repo_id, quant_type = split_remote_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S")
assert repo_id == "unsloth/Qwen3-0.6B-GGUF"
assert quant_type == "IQ1_S"
repo_id, quant_type = split_remote_gguf("repo/model:Q2_K")
assert repo_id == "repo/model"
assert quant_type == "Q2_K"
def test_split_remote_gguf_with_path_object(self):
"""Test split_remote_gguf with Path object."""
repo_id, quant_type = split_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S"))
assert repo_id == "unsloth/Qwen3-0.6B-GGUF"
assert quant_type == "IQ1_S"
def test_split_remote_gguf_invalid(self):
"""Test split_remote_gguf with invalid format."""
# Invalid format (no colon) - is_remote_gguf returns False
with pytest.raises(ValueError, match="Wrong GGUF model"):
split_remote_gguf("repo/model")
# Invalid quant type - is_remote_gguf returns False
with pytest.raises(ValueError, match="Wrong GGUF model"):
split_remote_gguf("repo/model:INVALID_TYPE")
# HTTP URL - is_remote_gguf returns False
with pytest.raises(ValueError, match="Wrong GGUF model"):
split_remote_gguf("http://repo/model:IQ1_S")
# Cloud storage - is_remote_gguf returns False
with pytest.raises(ValueError, match="Wrong GGUF model"):
split_remote_gguf("s3://bucket/repo/model:Q2_K")
class TestIsGGUF:
"""Test is_gguf utility function."""
@patch("vllm.transformers_utils.utils.check_gguf_file", return_value=True)
def test_is_gguf_with_local_file(self, mock_check_gguf):
"""Test is_gguf with local GGUF file."""
assert is_gguf("/path/to/model.gguf")
assert is_gguf("./model.gguf")
def test_is_gguf_with_remote_gguf(self):
"""Test is_gguf with remote GGUF format."""
# Valid remote GGUF format (repo_id:quant_type with valid quant_type)
assert is_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S")
assert is_gguf("repo/model:Q2_K")
assert is_gguf("repo/model:Q4_K")
# Invalid quant_type should return False
assert not is_gguf("repo/model:quant")
assert not is_gguf("repo/model:INVALID")
@patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False)
def test_is_gguf_false(self, mock_check_gguf):
"""Test is_gguf returns False for non-GGUF models."""
assert not is_gguf("unsloth/Qwen3-0.6B")
assert not is_gguf("repo/model")
assert not is_gguf("model")
def test_is_gguf_edge_cases(self):
"""Test is_gguf with edge cases."""
# Empty string
assert not is_gguf("")
# Only colon, no slash (even with valid quant_type)
assert not is_gguf("model:IQ1_S")
# Only slash, no colon
assert not is_gguf("repo/model")
# HTTP/HTTPS URLs
assert not is_gguf("http://repo/model:IQ1_S")
assert not is_gguf("https://repo/model:Q2_K")
# Cloud storage
assert not is_gguf("s3://bucket/repo/model:IQ1_S")
assert not is_gguf("gs://bucket/repo/model:Q2_K")

View File

@ -166,7 +166,7 @@ def test_dict_args(parser):
"--hf-overrides.key2.key4", "--hf-overrides.key2.key4",
"val3", "val3",
# Test compile config and compilation mode # Test compile config and compilation mode
"-O.use_inductor=true", "-O.use_inductor_graph_partition=true",
"-O.backend", "-O.backend",
"custom", "custom",
"-O1", "-O1",
@ -219,7 +219,7 @@ def test_dict_args(parser):
} }
assert parsed_args.compilation_config == { assert parsed_args.compilation_config == {
"mode": 1, "mode": 1,
"use_inductor": True, "use_inductor_graph_partition": True,
"backend": "custom", "backend": "custom",
"custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
} }

View File

@ -1248,7 +1248,9 @@ def test_allocate_with_lookahead():
) )
# Test case 1: Requires additional lookahead tokens # Test case 1: Requires additional lookahead tokens
kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100) kv_cache_manager = KVCacheManager(
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
)
blocks = kv_cache_manager.allocate_slots( blocks = kv_cache_manager.allocate_slots(
request, request,
num_new_tokens=3, num_new_tokens=3,
@ -1257,7 +1259,9 @@ def test_allocate_with_lookahead():
assert len(blocks.get_block_ids()[0]) == 2 # ceil(5/4)=2 blocks assert len(blocks.get_block_ids()[0]) == 2 # ceil(5/4)=2 blocks
# Test case 2: With precomputed blocks # Test case 2: With precomputed blocks
kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100) kv_cache_manager = KVCacheManager(
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
)
# required_blocks = ceil((3 + 2) /4) = 2 # required_blocks = ceil((3 + 2) /4) = 2
blocks = kv_cache_manager.allocate_slots( blocks = kv_cache_manager.allocate_slots(
request, request,
@ -1268,7 +1272,9 @@ def test_allocate_with_lookahead():
# Test case 3: With precomputed blocks # Test case 3: With precomputed blocks
# required_blocks = ceil((3 + 4) / 4) = 2 # required_blocks = ceil((3 + 4) / 4) = 2
kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100) kv_cache_manager = KVCacheManager(
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
)
blocks = kv_cache_manager.allocate_slots( blocks = kv_cache_manager.allocate_slots(
request, request,
num_new_tokens=3, num_new_tokens=3,
@ -1495,7 +1501,8 @@ def test_get_kv_cache_config_one_worker():
), ),
], ],
) )
# different hidden size
# different hidden size but same type, use UniformTypeKVCacheSpecs
kv_cache_specs_hybrid = { kv_cache_specs_hybrid = {
"layer_1": new_kv_cache_spec(head_size=128), "layer_1": new_kv_cache_spec(head_size=128),
"layer_2": new_kv_cache_spec(head_size=64), "layer_2": new_kv_cache_spec(head_size=64),
@ -1519,6 +1526,40 @@ def test_get_kv_cache_config_one_worker():
], ],
) )
# Different hidden size and different type, align by different block size
kv_cache_specs_hybrid = {
"layer_1": new_kv_cache_spec(head_size=64),
"layer_2": new_sliding_window_spec(head_size=32),
}
kv_cache_config_hybrid = get_kv_cache_configs(
vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 32]
)[0]
assert kv_cache_config_hybrid == KVCacheConfig(
num_blocks=32,
kv_cache_tensors=[
KVCacheTensor(
size=mem_per_block_per_layer * 32, shared_by=["layer_1", "layer_2"]
),
],
kv_cache_groups=[
KVCacheGroupSpec(["layer_1"], new_kv_cache_spec(head_size=64)),
KVCacheGroupSpec(
["layer_2"], new_sliding_window_spec(head_size=32, block_size=32)
),
],
)
# different hidden size that cannot be aligned by using different block size
kv_cache_specs_hybrid = {
"layer_1": new_kv_cache_spec(head_size=64),
"layer_2": new_sliding_window_spec(head_size=96),
}
with pytest.raises(NotImplementedError):
get_kv_cache_configs(
vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 2 * 32]
)[0]
# Test num_gpu_blocks_override # Test num_gpu_blocks_override
vllm_config.cache_config.num_gpu_blocks_override = 16 vllm_config.cache_config.num_gpu_blocks_override = 16
kv_cache_config_override_blocks = get_kv_cache_configs( kv_cache_config_override_blocks = get_kv_cache_configs(

View File

@ -134,6 +134,7 @@ def test_prefill(hash_fn):
make_kv_cache_config(block_size, 11), make_kv_cache_config(block_size, 11),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
# Complete 3 blocks (48 tokens) # Complete 3 blocks (48 tokens)
@ -256,6 +257,7 @@ def test_prefill_hybrid_model():
make_kv_cache_config_hybrid_model(block_size, 21), make_kv_cache_config_hybrid_model(block_size, 21),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
hash_fn = sha256 hash_fn = sha256
@ -416,6 +418,7 @@ def test_prefill_plp():
make_kv_cache_config(block_size, 11), make_kv_cache_config(block_size, 11),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
# the default hash function is sha256 # the default hash function is sha256
hash_fn = sha256 hash_fn = sha256
@ -523,6 +526,7 @@ def test_decode():
make_kv_cache_config(block_size, 11), make_kv_cache_config(block_size, 11),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
# Complete 3 blocks (48 tokens) # Complete 3 blocks (48 tokens)
@ -585,6 +589,7 @@ def test_evict():
make_kv_cache_config(block_size, 11), make_kv_cache_config(block_size, 11),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
last_token_id = 5 * 16 + 7 last_token_id = 5 * 16 + 7
@ -643,6 +648,7 @@ def test_hash_block_correct_reuse():
make_kv_cache_config(16, 2), make_kv_cache_config(16, 2),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
# Allocate 1 block and cache it. # Allocate 1 block and cache it.
@ -683,6 +689,7 @@ def test_computed_blocks_not_evicted():
make_kv_cache_config(block_size, 3), make_kv_cache_config(block_size, 3),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
# Allocate a block and cache it. # Allocate a block and cache it.
@ -741,6 +748,7 @@ def test_basic_prefix_caching_disabled():
make_kv_cache_config(block_size, 5), make_kv_cache_config(block_size, 5),
max_model_len=8192, max_model_len=8192,
enable_caching=False, enable_caching=False,
hash_block_size=block_size,
) )
req1 = make_request( req1 = make_request(
@ -790,6 +798,7 @@ def test_cache_blocks(hash_fn):
block_pool = BlockPool( block_pool = BlockPool(
num_gpu_blocks=5, num_gpu_blocks=5,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
# Req: # Req:
# Block 0: [0, 1, 2, 3] # Block 0: [0, 1, 2, 3]
@ -833,7 +842,9 @@ def test_cache_blocks_multi_group():
This tests that blocks are cached correctly for different kv cache groups. This tests that blocks are cached correctly for different kv cache groups.
""" """
block_size = 4 block_size = 4
block_pool = BlockPool(num_gpu_blocks=10, enable_caching=True) block_pool = BlockPool(
num_gpu_blocks=10, enable_caching=True, hash_block_size=block_size
)
# Req: # Req:
# Block 0/4: [0, 1, 2, 3] # Block 0/4: [0, 1, 2, 3]
@ -921,6 +932,7 @@ def test_mm_prefix_caching():
make_kv_cache_config(block_size, 11), make_kv_cache_config(block_size, 11),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
# Common prompt tokens (T is text tokens and P is image placeholder tokens) # Common prompt tokens (T is text tokens and P is image placeholder tokens)
@ -1020,6 +1032,7 @@ def test_cache_key_salting():
make_kv_cache_config(block_size, 11), make_kv_cache_config(block_size, 11),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
# 3 complete blocks and an incomplete block with 11 tokens. # 3 complete blocks and an incomplete block with 11 tokens.
@ -1101,6 +1114,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
make_kv_cache_config(block_size, 11), make_kv_cache_config(block_size, 11),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
# Complete 3 blocks (48 tokens) # Complete 3 blocks (48 tokens)
# | Common-0 | Common-1 | Common-2 | ... | # | Common-0 | Common-1 | Common-2 | ... |
@ -1173,6 +1187,7 @@ def test_reset_prefix_cache():
make_kv_cache_config(block_size, 11), make_kv_cache_config(block_size, 11),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
) )
full_block_token_ids = [i for i in range(3) for _ in range(16)] full_block_token_ids = [i for i in range(3) for _ in range(16)]
@ -1213,6 +1228,7 @@ def test_prefix_cache_stats_disabled():
make_kv_cache_config(block_size, 11), make_kv_cache_config(block_size, 11),
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
hash_block_size=block_size,
log_stats=False, # Disable logging stats log_stats=False, # Disable logging stats
) )
assert manager.prefix_cache_stats is None assert manager.prefix_cache_stats is None
@ -1232,7 +1248,7 @@ def test_prefix_cache_stats_disabled():
def test_maybe_evict_cached_block(): def test_maybe_evict_cached_block():
pool = BlockPool(num_gpu_blocks=4, enable_caching=True) pool = BlockPool(num_gpu_blocks=4, enable_caching=True, hash_block_size=16)
block_hash0 = make_block_hash_with_group_id(BlockHash(b"10"), 1000) block_hash0 = make_block_hash_with_group_id(BlockHash(b"10"), 1000)
block_hash1 = make_block_hash_with_group_id(BlockHash(b"20"), 2000) block_hash1 = make_block_hash_with_group_id(BlockHash(b"20"), 2000)
block_hash2 = make_block_hash_with_group_id(BlockHash(b"30"), 3000) block_hash2 = make_block_hash_with_group_id(BlockHash(b"30"), 3000)
@ -1293,6 +1309,7 @@ def test_kv_cache_events(blocks_to_cache: int):
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
enable_kv_cache_events=True, enable_kv_cache_events=True,
hash_block_size=block_size,
) )
num_tokens = block_size * blocks_to_cache num_tokens = block_size * blocks_to_cache
@ -1351,6 +1368,7 @@ def test_kv_cache_events_with_lora(blocks_to_cache: int):
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
enable_kv_cache_events=True, enable_kv_cache_events=True,
hash_block_size=block_size,
) )
# Test with LoRA request # Test with LoRA request
@ -1405,6 +1423,7 @@ def test_eagle_enabled_removes_last_block():
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
use_eagle=True, use_eagle=True,
hash_block_size=block_size,
) )
# Request with 3 full blocks (48 tokens) # Request with 3 full blocks (48 tokens)
@ -1437,6 +1456,7 @@ def test_eagle_with_partial_blocks():
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
use_eagle=True, use_eagle=True,
hash_block_size=block_size,
) )
# 2 full blocks + 5 tokens (non-divisible length) # 2 full blocks + 5 tokens (non-divisible length)
token_ids = [0] * (2 * block_size + 5) token_ids = [0] * (2 * block_size + 5)
@ -1476,6 +1496,7 @@ def test_eagle_with_sliding_window():
max_model_len=8192, max_model_len=8192,
enable_caching=True, enable_caching=True,
use_eagle=True, use_eagle=True,
hash_block_size=block_size,
) )
# 2 full blocks + 5 tokens (non-divisible length) # 2 full blocks + 5 tokens (non-divisible length)
@ -1522,6 +1543,76 @@ def test_eagle_with_sliding_window():
assert num_tokens == 0 assert num_tokens == 0
def test_different_block_size():
block_size = 16
# full attention and sliding window attention layers have the same page size:
# (32 tokens/block * float16 token, vs. 16 tokens/block * float32 token)
kv_cache_config = KVCacheConfig(
num_blocks=100,
kv_cache_tensors=[],
kv_cache_groups=[
KVCacheGroupSpec(
["layer1"],
FullAttentionSpec(block_size * 2, 1, 1, torch.float16),
),
KVCacheGroupSpec(
["layer2"],
SlidingWindowSpec(
block_size,
1,
1,
torch.float32,
sliding_window=2 * block_size,
),
),
],
)
manager = KVCacheManager(
kv_cache_config=kv_cache_config,
max_model_len=8192,
enable_caching=True,
hash_block_size=block_size,
)
# 10 blocks of 16 tokens each. Token ids are not strictly aligned for each block.
common_token_ids = [i for i in range(10) for _ in range(block_size)]
req0 = make_request("0", common_token_ids, block_size, sha256)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert not computed_blocks.blocks[0]
assert not computed_blocks.blocks[1]
assert num_computed_tokens == 0
blocks = manager.allocate_slots(
req0, 7 * block_size, len(computed_blocks.blocks[0]) * 16, computed_blocks
)
assert blocks.get_block_ids() == ([1, 2, 3, 4], [5, 6, 7, 8, 9, 10, 11])
req1 = make_request("1", common_token_ids[: 7 * block_size + 1], block_size, sha256)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert len(computed_blocks.blocks[0]) == 3
assert len(computed_blocks.blocks[1]) == 6
assert num_computed_tokens == 6 * 16
req2 = make_request("2", common_token_ids[: 6 * block_size + 1], block_size, sha256)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert len(computed_blocks.blocks[0]) == 3
assert len(computed_blocks.blocks[1]) == 6
assert num_computed_tokens == 6 * 16
# Evict some blocks to make sliding window cache hit length 5*16
# But should return 4 * 16 because full attention cache hit length must be
# a multiple of 32
manager.block_pool.cached_block_hash_to_block.pop(
make_block_hash_with_group_id(req1.block_hashes[6], 1), 11
)
manager.block_pool.cached_block_hash_to_block.pop(
make_block_hash_with_group_id(req1.block_hashes[5], 1), 10
)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert len(computed_blocks.blocks[0]) == 2
assert len(computed_blocks.blocks[1]) == 4
assert num_computed_tokens == 4 * 16
def test_block_lookup_cache_single_block_per_key(): def test_block_lookup_cache_single_block_per_key():
cache = BlockHashToBlockMap() cache = BlockHashToBlockMap()
key0 = BlockHashWithGroupId(b"hash0") key0 = BlockHashWithGroupId(b"hash0")

View File

@ -41,7 +41,9 @@ def test_chunked_local_attention_possible_cached_prefix():
attention_chunk_size=4, attention_chunk_size=4,
) )
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) block_pool = BlockPool(
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
)
manager = get_chunked_local_attention_manager( manager = get_chunked_local_attention_manager(
chunked_local_attention_spec, block_pool chunked_local_attention_spec, block_pool
) )
@ -70,6 +72,7 @@ def test_chunked_local_attention_possible_cached_prefix():
block_pool=block_pool, block_pool=block_pool,
kv_cache_spec=chunked_local_attention_spec, kv_cache_spec=chunked_local_attention_spec,
use_eagle=False, use_eagle=False,
alignment_tokens=block_size,
)[0] )[0]
assert len(computed_blocks) == expect_length assert len(computed_blocks) == expect_length
@ -111,7 +114,9 @@ def test_sliding_window_possible_cached_prefix():
sliding_window=4, sliding_window=4,
) )
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) block_pool = BlockPool(
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
)
manager = get_sliding_window_manager(sliding_window_spec, block_pool) manager = get_sliding_window_manager(sliding_window_spec, block_pool)
def run_one_case(block_is_cached, expect_length): def run_one_case(block_is_cached, expect_length):
@ -138,6 +143,7 @@ def test_sliding_window_possible_cached_prefix():
block_pool=block_pool, block_pool=block_pool,
kv_cache_spec=sliding_window_spec, kv_cache_spec=sliding_window_spec,
use_eagle=False, use_eagle=False,
alignment_tokens=block_size,
)[0] )[0]
assert len(computed_blocks) == expect_length assert len(computed_blocks) == expect_length
@ -178,7 +184,7 @@ def test_chunked_local_attention_remove_skipped_blocks():
attention_chunk_size=4, attention_chunk_size=4,
) )
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True) block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2)
manager = get_chunked_local_attention_manager(attention_spec, block_pool) manager = get_chunked_local_attention_manager(attention_spec, block_pool)
@ -239,7 +245,7 @@ def test_sliding_window_remove_skipped_blocks():
sliding_window=4, sliding_window=4,
) )
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True) block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2)
manager = get_sliding_window_manager(sliding_window_spec, block_pool) manager = get_sliding_window_manager(sliding_window_spec, block_pool)
@ -316,7 +322,9 @@ def test_get_num_blocks_to_allocate():
sliding_window=4, # Placeholder value, not related to test result sliding_window=4, # Placeholder value, not related to test result
) )
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) block_pool = BlockPool(
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
)
manager = get_sliding_window_manager(sliding_window_spec, block_pool) manager = get_sliding_window_manager(sliding_window_spec, block_pool)
cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)] cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [ cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [
@ -341,7 +349,9 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
attention_chunk_size=4, # Placeholder value, not related to test result attention_chunk_size=4, # Placeholder value, not related to test result
) )
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) block_pool = BlockPool(
num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
)
manager = get_chunked_local_attention_manager(attention_spec, block_pool) manager = get_chunked_local_attention_manager(attention_spec, block_pool)
cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)] cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [ cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [

View File

@ -87,6 +87,11 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
# Set small draft model len to force doesn't-fit-in-drafter case. # Set small draft model len to force doesn't-fit-in-drafter case.
spec_config_short = spec_config | {"max_model_len": 50} spec_config_short = spec_config | {"max_model_len": 50}
test_sampling_params = [
dict(),
dict(logprobs=2),
]
# test_preemption, executor, async_scheduling, # test_preemption, executor, async_scheduling,
# spec_config, test_prefill_chunking # spec_config, test_prefill_chunking
test_configs = [ test_configs = [
@ -103,7 +108,7 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
(True, "uni", True, spec_config_short, True), (True, "uni", True, spec_config_short, True),
] ]
run_tests(monkeypatch, MTP_MODEL, test_configs, [{}]) run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params)
@dynamo_config.patch(cache_size_limit=16) @dynamo_config.patch(cache_size_limit=16)

View File

@ -11,6 +11,7 @@ import pprint
import time import time
from collections.abc import Callable, Sequence from collections.abc import Callable, Sequence
from contextlib import contextmanager from contextlib import contextmanager
from copy import deepcopy
from functools import partial from functools import partial
from typing import Any from typing import Any
@ -429,7 +430,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
self.vllm_backend.compiler_manager.compile( self.vllm_backend.compiler_manager.compile(
submod, submod,
args, args,
self.compilation_config.inductor_compile_config, self.vllm_backend.inductor_config,
self.compilation_config, self.compilation_config,
graph_index=index, graph_index=index,
num_graphs=len(self.compile_submod_names), num_graphs=len(self.compile_submod_names),
@ -531,6 +532,9 @@ class VllmBackend:
sym_tensor_indices: list[int] sym_tensor_indices: list[int]
input_buffers: list[torch.Tensor] input_buffers: list[torch.Tensor]
compiler_manager: CompilerManager compiler_manager: CompilerManager
# Copy of CompilationConfig.inductor_compile_config +
# an entry for PostGradPassManager
inductor_config: dict[str, Any]
def __init__( def __init__(
self, self,
@ -561,25 +565,30 @@ class VllmBackend:
self.compilation_config self.compilation_config
) )
# Deepcopy the inductor config to detach the post-grad custom pass
# from CompilationConfig.
# We want to avoid PostGradPassManager in CompilationConfig because
# in future we need PostGradPassManager.uuid() to be executed
# only at compile time.
self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config)
# `torch.compile` is JIT compiled, so we don't need to # `torch.compile` is JIT compiled, so we don't need to
# do anything here # do anything here
def configure_post_pass(self): def configure_post_pass(self):
config = self.compilation_config
self.pass_manager.configure(self.vllm_config) self.pass_manager.configure(self.vllm_config)
# Post-grad custom passes are run using the post_grad_custom_post_pass # Post-grad custom passes are run using the post_grad_custom_post_pass
# hook. If a pass for that hook exists, add it to the pass manager. # hook. If a pass for that hook exists, add it to the pass manager.
inductor_config = config.inductor_compile_config if self.pass_key in self.inductor_config:
if self.pass_key in inductor_config: if isinstance(self.inductor_config[self.pass_key], PostGradPassManager):
if isinstance(inductor_config[self.pass_key], PostGradPassManager): raise ValueError(
# PassManager already added to config, make sure it's correct "PostGradPassManager can not be kept in CompilationConfig."
assert inductor_config[self.pass_key].uuid() == self.pass_manager.uuid() )
else: else:
# Config should automatically wrap all inductor passes # Config should automatically wrap all inductor passes
assert isinstance(inductor_config[self.pass_key], InductorPass) assert isinstance(self.inductor_config[self.pass_key], InductorPass)
self.pass_manager.add(inductor_config[self.pass_key]) self.pass_manager.add(self.inductor_config[self.pass_key])
inductor_config[self.pass_key] = self.pass_manager self.inductor_config[self.pass_key] = self.pass_manager
def __call__( def __call__(
self, graph: fx.GraphModule, example_inputs self, graph: fx.GraphModule, example_inputs
@ -638,9 +647,7 @@ class VllmBackend:
self.compilation_config.local_cache_dir = local_cache_dir self.compilation_config.local_cache_dir = local_cache_dir
# Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE. # Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE.
disable_cache = not is_compile_cache_enabled( disable_cache = not is_compile_cache_enabled(self.inductor_config)
self.compilation_config.inductor_compile_config
)
if disable_cache: if disable_cache:
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local") logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import inspect import inspect
import os import os
import pickle import pickle
@ -14,6 +13,7 @@ import vllm.envs as envs
from vllm.config import VllmConfig, get_current_vllm_config from vllm.config import VllmConfig, get_current_vllm_config
from vllm.config.utils import hash_factors from vllm.config.utils import hash_factors
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
try: try:
from torch._dynamo.aot_compile import SerializableCallable from torch._dynamo.aot_compile import SerializableCallable
@ -160,7 +160,7 @@ def _compute_code_hash_with_content(file_contents: dict[str, str]) -> str:
# e.g. exec(). We can't actually check these. # e.g. exec(). We can't actually check these.
continue continue
hash_content.append(content) hash_content.append(content)
return hashlib.md5( return safe_hash(
"\n".join(hash_content).encode(), usedforsecurity=False "\n".join(hash_content).encode(), usedforsecurity=False
).hexdigest() ).hexdigest()

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib import contextlib
import copy import copy
import hashlib
import os import os
from collections.abc import Callable from collections.abc import Callable
from contextlib import ExitStack from contextlib import ExitStack
@ -16,6 +15,7 @@ import torch.fx as fx
import vllm.envs as envs import vllm.envs as envs
from vllm.compilation.counter import compilation_counter from vllm.compilation.counter import compilation_counter
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.utils.hashing import safe_hash
from vllm.utils.torch_utils import is_torch_equal_or_newer from vllm.utils.torch_utils import is_torch_equal_or_newer
@ -197,9 +197,9 @@ class InductorStandaloneAdaptor(CompilerInterface):
def compute_hash(self, vllm_config: VllmConfig) -> str: def compute_hash(self, vllm_config: VllmConfig) -> str:
factors = get_inductor_factors() factors = get_inductor_factors()
hash_str = hashlib.md5( hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[
str(factors).encode(), usedforsecurity=False :10
).hexdigest()[:10] ]
return hash_str return hash_str
def initialize_cache( def initialize_cache(
@ -286,9 +286,9 @@ class InductorAdaptor(CompilerInterface):
def compute_hash(self, vllm_config: VllmConfig) -> str: def compute_hash(self, vllm_config: VllmConfig) -> str:
factors = get_inductor_factors() factors = get_inductor_factors()
hash_str = hashlib.md5( hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[
str(factors).encode(), usedforsecurity=False :10
).hexdigest()[:10] ]
return hash_str return hash_str
def initialize_cache( def initialize_cache(

View File

@ -107,7 +107,7 @@ class PiecewiseBackend:
entry.runnable = self.vllm_backend.compiler_manager.compile( entry.runnable = self.vllm_backend.compiler_manager.compile(
self.graph, self.graph,
args, args,
self.compilation_config.inductor_compile_config, self.vllm_backend.inductor_config,
self.compilation_config, self.compilation_config,
graph_index=self.piecewise_compile_index, graph_index=self.piecewise_compile_index,
num_graphs=self.total_piecewise_compiles, num_graphs=self.total_piecewise_compiles,

View File

@ -144,7 +144,7 @@ class CacheConfig:
kv_offloading_backend: KVOffloadingBackend | None = None kv_offloading_backend: KVOffloadingBackend | None = None
"""The backend to use for KV cache offloading. Supported backends include """The backend to use for KV cache offloading. Supported backends include
'native' (vLLM native CPU offloading), 'lmcache' This option must be used 'native' (vLLM native CPU offloading), 'lmcache' This option must be used
together with kv_offloading_size.""" together with kv_offloading_size."""
def compute_hash(self) -> str: def compute_hash(self) -> str:
@ -167,8 +167,6 @@ class CacheConfig:
"num_gpu_blocks_override", "num_gpu_blocks_override",
"enable_prefix_caching", "enable_prefix_caching",
"prefix_caching_hash_algo", "prefix_caching_hash_algo",
# `cpu_offload_gb` does not use `torch.compile` yet.
"cpu_offload_gb",
"cpu_kvcache_space_bytes", "cpu_kvcache_space_bytes",
"mamba_page_size_padded", "mamba_page_size_padded",
# Post-init/derived counters # Post-init/derived counters

View File

@ -264,7 +264,6 @@ class CompilationConfig:
- [`cudagraph_copy_inputs`] - [`cudagraph_copy_inputs`]
[vllm.config.CompilationConfig.cudagraph_copy_inputs] [vllm.config.CompilationConfig.cudagraph_copy_inputs]
- Inductor compilation: - Inductor compilation:
- [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
- [`inductor_compile_config`] - [`inductor_compile_config`]
[vllm.config.CompilationConfig.inductor_compile_config] [vllm.config.CompilationConfig.inductor_compile_config]
@ -348,7 +347,7 @@ class CompilationConfig:
- 'none,+op1,+op2' to enable only op1 and op2 - 'none,+op1,+op2' to enable only op1 and op2
By default, all custom ops are enabled when running without Inductor and By default, all custom ops are enabled when running without Inductor and
disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor".
Inductor generates (fused) Triton kernels for disabled custom ops.""" Inductor generates (fused) Triton kernels for disabled custom ops."""
splitting_ops: list[str] | None = None splitting_ops: list[str] | None = None
"""A list of ops to exclude from cudagraphs, used in piecewise compilation. """A list of ops to exclude from cudagraphs, used in piecewise compilation.
@ -374,24 +373,6 @@ class CompilationConfig:
Disabled by default until more models are supported/tested to work.""" Disabled by default until more models are supported/tested to work."""
# Inductor capture # Inductor capture
use_inductor: bool | None = None
"""
Whether to use inductor compilation.
This flag is deprecated and will be removed in the next release 0.12.0.
Please use the 'backend' option instead.
- False: inductor compilation is not used. graph runs in eager
(custom_ops enabled by default).
- True: inductor compilation is used (custom_ops disabled by default).
One graph for symbolic shape and one graph per size in compile_sizes
are compiled using configurations in inductor_compile_config.
This setting is ignored if mode<VLLM_COMPILE.
For future compatibility:
If use_inductor is True, backend="inductor" otherwise backend="eager".
"""
compile_sizes: list[int | str] | None = None compile_sizes: list[int | str] | None = None
"""Sizes to compile for inductor. In addition """Sizes to compile for inductor. In addition
to integers, it also supports "cudagraph_capture_sizes" to to integers, it also supports "cudagraph_capture_sizes" to
@ -759,14 +740,6 @@ class CompilationConfig:
f"Invalid backend for piecewise compilation: {self.backend}" f"Invalid backend for piecewise compilation: {self.backend}"
) )
if self.use_inductor is not None:
logger.warning_once(
"The 'use_inductor' flag is deprecated and will be "
"removed in the next release (v0.12.0). "
"Please use the 'backend' option instead.",
)
self.backend = "inductor" if self.use_inductor else "eager"
if self.backend == "": if self.backend == "":
self.backend = current_platform.get_compile_backend() self.backend = current_platform.get_compile_backend()

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from dataclasses import field from dataclasses import field
from typing import Any, Literal from typing import Any, Literal
@ -10,6 +9,7 @@ from pydantic import ConfigDict, SkipValidation
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
from vllm.config.utils import config from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"] Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
@ -45,7 +45,7 @@ class DeviceConfig:
# the device/platform information will be summarized # the device/platform information will be summarized
# by torch/vllm automatically. # by torch/vllm automatically.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __post_init__(self): def __post_init__(self):

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import uuid import uuid
from dataclasses import field from dataclasses import field
from typing import Any, Literal, get_args from typing import Any, Literal, get_args
@ -9,6 +8,7 @@ from typing import Any, Literal, get_args
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
from vllm.config.utils import config from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
KVProducer = Literal["kv_producer", "kv_both"] KVProducer = Literal["kv_producer", "kv_both"]
KVConsumer = Literal["kv_consumer", "kv_both"] KVConsumer = Literal["kv_consumer", "kv_both"]
@ -79,7 +79,7 @@ class KVTransferConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str return hash_str
def __post_init__(self) -> None: def __post_init__(self) -> None:

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import TYPE_CHECKING, Any from typing import TYPE_CHECKING, Any
from pydantic import Field, field_validator from pydantic import Field, field_validator
@ -9,6 +8,7 @@ from pydantic.dataclasses import dataclass
from vllm.config.utils import config from vllm.config.utils import config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.model_executor.model_loader import LoadFormats from vllm.model_executor.model_loader import LoadFormats
@ -104,7 +104,7 @@ class LoadConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str return hash_str
@field_validator("load_format", mode="after") @field_validator("load_format", mode="after")

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import TYPE_CHECKING, Any, Literal from typing import TYPE_CHECKING, Any, Literal
import torch import torch
@ -11,6 +10,7 @@ from typing_extensions import Self
from vllm.config.utils import config from vllm.config.utils import config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import ModelConfig from vllm.config import ModelConfig
@ -74,7 +74,7 @@ class LoRAConfig:
factors.append(self.fully_sharded_loras) factors.append(self.fully_sharded_loras)
factors.append(self.lora_dtype) factors.append(self.lora_dtype)
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str return hash_str
@model_validator(mode="after") @model_validator(mode="after")

View File

@ -39,7 +39,12 @@ from vllm.transformers_utils.gguf_utils import (
maybe_patch_hf_config_from_gguf, maybe_patch_hf_config_from_gguf,
) )
from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
from vllm.transformers_utils.utils import check_gguf_file, maybe_model_redirect from vllm.transformers_utils.utils import (
is_gguf,
is_remote_gguf,
maybe_model_redirect,
split_remote_gguf,
)
from vllm.utils.import_utils import LazyLoader from vllm.utils.import_utils import LazyLoader
from vllm.utils.torch_utils import common_broadcastable_dtype from vllm.utils.torch_utils import common_broadcastable_dtype
@ -294,9 +299,6 @@ class ModelConfig:
pooler_config: PoolerConfig | None = None pooler_config: PoolerConfig | None = None
"""Pooler config which controls the behaviour of output pooling in pooling """Pooler config which controls the behaviour of output pooling in pooling
models.""" models."""
override_pooler_config: dict | PoolerConfig | None = None
"""[DEPRECATED] Use `pooler_config` instead. This field will be removed in
v0.12.0 or v1.0.0, whichever is sooner."""
# Multimodal config and init vars # Multimodal config and init vars
multimodal_config: MultiModalConfig | None = None multimodal_config: MultiModalConfig | None = None
@ -343,7 +345,6 @@ class ModelConfig:
"logprobs_mode", "logprobs_mode",
"disable_cascade_attn", "disable_cascade_attn",
"skip_tokenizer_init", "skip_tokenizer_init",
"enable_prompt_embeds",
"served_model_name", "served_model_name",
"config_format", "config_format",
"hf_token", "hf_token",
@ -354,7 +355,6 @@ class ModelConfig:
"logits_processors", "logits_processors",
"io_processor_plugin", "io_processor_plugin",
"pooler_config", "pooler_config",
"override_pooler_config",
"multimodal_config", "multimodal_config",
"limit_mm_per_prompt", "limit_mm_per_prompt",
"media_io_kwargs", "media_io_kwargs",
@ -440,7 +440,8 @@ class ModelConfig:
self.model = maybe_model_redirect(self.model) self.model = maybe_model_redirect(self.model)
# The tokenizer is consistent with the model by default. # The tokenizer is consistent with the model by default.
if self.tokenizer is None: if self.tokenizer is None:
if check_gguf_file(self.model): # Check if this is a GGUF model (either local file or remote GGUF)
if is_gguf(self.model):
raise ValueError( raise ValueError(
"Using a tokenizer is mandatory when loading a GGUF model. " "Using a tokenizer is mandatory when loading a GGUF model. "
"Please specify the tokenizer path or name using the " "Please specify the tokenizer path or name using the "
@ -642,18 +643,6 @@ class ModelConfig:
# Init pooler config if needed # Init pooler config if needed
if self.runner_type == "pooling": if self.runner_type == "pooling":
if self.override_pooler_config is not None:
logger.warning_once(
"`override_pooler_config` is deprecated and will be "
"removed in v0.12.0 or v1.0.0, whichever is sooner. "
"Please use `pooler_config` instead."
)
if isinstance(self.override_pooler_config, dict):
self.pooler_config = PoolerConfig(**self.override_pooler_config)
else:
self.pooler_config = self.override_pooler_config
if self.pooler_config is None: if self.pooler_config is None:
self.pooler_config = PoolerConfig() self.pooler_config = PoolerConfig()
@ -832,7 +821,10 @@ class ModelConfig:
self.tokenizer = object_storage_tokenizer.dir self.tokenizer = object_storage_tokenizer.dir
def _get_encoder_config(self): def _get_encoder_config(self):
return get_sentence_transformer_tokenizer_config(self.model, self.revision) model = self.model
if is_remote_gguf(model):
model, _ = split_remote_gguf(model)
return get_sentence_transformer_tokenizer_config(model, self.revision)
def _verify_tokenizer_mode(self) -> None: def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower()) tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from collections.abc import Mapping from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, Literal, TypeAlias from typing import TYPE_CHECKING, Any, Literal, TypeAlias
@ -9,6 +8,7 @@ from pydantic import ConfigDict, Field, field_validator, model_validator
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
from vllm.config.utils import config from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.backends.registry import AttentionBackendEnum
@ -216,7 +216,7 @@ class MultiModalConfig:
if self.mm_encoder_attn_backend is not None if self.mm_encoder_attn_backend is not None
else None else None
] ]
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str return hash_str
def get_limit_per_prompt(self, modality: str) -> int: def get_limit_per_prompt(self, modality: str) -> int:

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from functools import cached_property from functools import cached_property
from typing import Any, Literal, cast from typing import Any, Literal, cast
@ -11,6 +10,7 @@ from pydantic.dataclasses import dataclass
from vllm import version from vllm import version
from vllm.config.utils import config from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
DetailedTraceModules = Literal["model", "worker", "all"] DetailedTraceModules = Literal["model", "worker", "all"]
@ -78,7 +78,7 @@ class ObservabilityConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str return hash_str
@field_validator("show_hidden_metrics_for_version") @field_validator("show_hidden_metrics_for_version")

View File

@ -593,9 +593,10 @@ class ParallelConfig:
"max_parallel_loading_workers is currently " "max_parallel_loading_workers is currently "
"not supported and will be ignored." "not supported and will be ignored."
) )
if self.distributed_executor_backend != "mp" and self.nnodes > 1: if self.distributed_executor_backend not in ("mp", "uni") and self.nnodes > 1:
raise ValueError( raise ValueError(
"nnodes > 1 can only be set when distributed exectuor backend is mp." "nnodes > 1 can only be set when distributed executor "
"backend is mp or uni."
) )
@property @property

View File

@ -1,13 +1,13 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import Any from typing import Any
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
from vllm.config.utils import config from vllm.config.utils import config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
logger = init_logger(__name__) logger = init_logger(__name__)
@ -102,7 +102,7 @@ class PoolerConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str return hash_str

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from collections.abc import Callable from collections.abc import Callable
from dataclasses import InitVar from dataclasses import InitVar
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
@ -12,6 +11,7 @@ from typing_extensions import Self, deprecated
from vllm.config.utils import config from vllm.config.utils import config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.import_utils import resolve_obj_by_qualname
if TYPE_CHECKING: if TYPE_CHECKING:
@ -178,7 +178,7 @@ class SchedulerConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str return hash_str
@field_validator("scheduler_cls", "async_scheduling", mode="wrap") @field_validator("scheduler_cls", "async_scheduling", mode="wrap")

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast import ast
import hashlib
from typing import TYPE_CHECKING, Any, Literal, get_args from typing import TYPE_CHECKING, Any, Literal, get_args
from pydantic import Field, SkipValidation, model_validator from pydantic import Field, SkipValidation, model_validator
@ -13,6 +12,7 @@ from vllm.config.model import ModelConfig
from vllm.config.parallel import ParallelConfig from vllm.config.parallel import ParallelConfig
from vllm.config.utils import config from vllm.config.utils import config
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
from vllm.utils.import_utils import LazyLoader, has_arctic_inference from vllm.utils.import_utils import LazyLoader, has_arctic_inference
if TYPE_CHECKING: if TYPE_CHECKING:
@ -162,7 +162,7 @@ class SpeculativeConfig:
# Eagle3 affects the computation graph because it returns intermediate # Eagle3 affects the computation graph because it returns intermediate
# hidden states in addition to the final hidden state. # hidden states in addition to the final hidden state.
factors.append(self.method == "eagle3") factors.append(self.method == "eagle3")
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str return hash_str
@staticmethod @staticmethod

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import Any, Literal from typing import Any, Literal
from pydantic import model_validator from pydantic import model_validator
@ -9,6 +8,7 @@ from pydantic.dataclasses import dataclass
from typing_extensions import Self from typing_extensions import Self
from vllm.config.utils import config from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
StructuredOutputsBackend = Literal[ StructuredOutputsBackend = Literal[
"auto", "xgrammar", "guidance", "outlines", "lm-format-enforcer" "auto", "xgrammar", "guidance", "outlines", "lm-format-enforcer"
@ -58,7 +58,7 @@ class StructuredOutputsConfig:
# no factors to consider. # no factors to consider.
# this config will not affect the computation graph. # this config will not affect the computation graph.
factors: list[Any] = [] factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str return hash_str
@model_validator(mode="after") @model_validator(mode="after")

View File

@ -3,7 +3,6 @@
import copy import copy
import getpass import getpass
import hashlib
import json import json
import os import os
import tempfile import tempfile
@ -25,6 +24,7 @@ from vllm.config.speculative import EagleModelTypes
from vllm.logger import enable_trace_function_call, init_logger from vllm.logger import enable_trace_function_call, init_logger
from vllm.transformers_utils.runai_utils import is_runai_obj_uri from vllm.transformers_utils.runai_utils import is_runai_obj_uri
from vllm.utils import random_uuid from vllm.utils import random_uuid
from vllm.utils.hashing import safe_hash
from .cache import CacheConfig from .cache import CacheConfig
from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
@ -193,7 +193,7 @@ class VllmConfig:
vllm_factors.append("None") vllm_factors.append("None")
if self.additional_config: if self.additional_config:
if isinstance(additional_config := self.additional_config, dict): if isinstance(additional_config := self.additional_config, dict):
additional_config_hash = hashlib.md5( additional_config_hash = safe_hash(
json.dumps(additional_config, sort_keys=True).encode(), json.dumps(additional_config, sort_keys=True).encode(),
usedforsecurity=False, usedforsecurity=False,
).hexdigest() ).hexdigest()
@ -204,9 +204,9 @@ class VllmConfig:
vllm_factors.append("None") vllm_factors.append("None")
factors.append(vllm_factors) factors.append(vllm_factors)
hash_str = hashlib.md5( hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[
str(factors).encode(), usedforsecurity=False :10
).hexdigest()[:10] ]
return hash_str return hash_str
def pad_for_cudagraph(self, batch_size: int) -> int: def pad_for_cudagraph(self, batch_size: int) -> int:

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import os import os
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Optional from typing import TYPE_CHECKING, Any, Optional
@ -15,6 +14,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorRole, KVConnectorRole,
) )
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash
from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.attention.backends.mla.common import MLACommonMetadata
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
@ -423,7 +423,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
if mm_hashes: if mm_hashes:
mm_str = "-".join(mm_hashes) mm_str = "-".join(mm_hashes)
token_bytes += mm_str.encode("utf-8") token_bytes += mm_str.encode("utf-8")
input_ids_hash = hashlib.md5(token_bytes, usedforsecurity=False).hexdigest() input_ids_hash = safe_hash(token_bytes, usedforsecurity=False).hexdigest()
foldername = os.path.join(self._storage_path, input_ids_hash) foldername = os.path.join(self._storage_path, input_ids_hash)
if create_folder: if create_folder:

View File

@ -51,6 +51,7 @@ from vllm.distributed.utils import StatelessProcessGroup
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.utils.network_utils import get_distributed_init_method from vllm.utils.network_utils import get_distributed_init_method
from vllm.utils.system_utils import suppress_stdout
from vllm.utils.torch_utils import ( from vllm.utils.torch_utils import (
direct_register_custom_op, direct_register_custom_op,
supports_custom_op, supports_custom_op,
@ -329,7 +330,8 @@ class GroupCoordinator:
) )
# a group with `gloo` backend, to allow direct coordination between # a group with `gloo` backend, to allow direct coordination between
# processes through the CPU. # processes through the CPU.
cpu_group = torch.distributed.new_group(ranks, backend="gloo") with suppress_stdout():
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
if self.rank in ranks: if self.rank in ranks:
self.ranks = ranks self.ranks = ranks
self.world_size = len(ranks) self.world_size = len(ranks)

View File

@ -30,6 +30,7 @@ from torch.distributed.rendezvous import rendezvous
import vllm.envs as envs import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.network_utils import get_tcp_uri from vllm.utils.network_utils import get_tcp_uri
from vllm.utils.system_utils import suppress_stdout
from vllm.utils.torch_utils import is_torch_equal_or_newer from vllm.utils.torch_utils import is_torch_equal_or_newer
logger = init_logger(__name__) logger = init_logger(__name__)
@ -427,33 +428,34 @@ def init_gloo_process_group(
Stateless init ProcessGroup with gloo backend compatible with Stateless init ProcessGroup with gloo backend compatible with
different torch versions. different torch versions.
""" """
if is_torch_equal_or_newer("2.6"): with suppress_stdout():
pg = ProcessGroup( if is_torch_equal_or_newer("2.6"):
prefix_store, pg = ProcessGroup(
group_rank, prefix_store,
group_size, group_rank,
) group_size,
else: )
options = ProcessGroup.Options(backend="gloo") else:
pg = ProcessGroup( options = ProcessGroup.Options(backend="gloo")
prefix_store, pg = ProcessGroup(
group_rank, prefix_store,
group_size, group_rank,
options, group_size,
) options,
from torch.distributed.distributed_c10d import ProcessGroupGloo )
from torch.distributed.distributed_c10d import ProcessGroupGloo
backend_class = ProcessGroupGloo( backend_class = ProcessGroupGloo(
prefix_store, group_rank, group_size, timeout=timeout prefix_store, group_rank, group_size, timeout=timeout
) )
backend_type = ProcessGroup.BackendType.GLOO backend_type = ProcessGroup.BackendType.GLOO
device = torch.device("cpu") device = torch.device("cpu")
if is_torch_equal_or_newer("2.6"): if is_torch_equal_or_newer("2.6"):
# _set_default_backend is supported in torch >= 2.6 # _set_default_backend is supported in torch >= 2.6
pg._set_default_backend(backend_type) pg._set_default_backend(backend_type)
backend_class._set_sequence_number_for_group() backend_class._set_sequence_number_for_group()
pg._register_backend(device, backend_type, backend_class) pg._register_backend(device, backend_type, backend_class)
return pg return pg

View File

@ -29,7 +29,7 @@ import regex as re
import torch import torch
from pydantic import TypeAdapter, ValidationError from pydantic import TypeAdapter, ValidationError
from pydantic.fields import FieldInfo from pydantic.fields import FieldInfo
from typing_extensions import TypeIs, deprecated from typing_extensions import TypeIs
import vllm.envs as envs import vllm.envs as envs
from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.backends.registry import AttentionBackendEnum
@ -86,7 +86,7 @@ from vllm.transformers_utils.config import (
is_interleaved, is_interleaved,
maybe_override_with_speculators, maybe_override_with_speculators,
) )
from vllm.transformers_utils.utils import check_gguf_file, is_cloud_storage from vllm.transformers_utils.utils import is_cloud_storage, is_gguf
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.network_utils import get_ip from vllm.utils.network_utils import get_ip
@ -520,9 +520,6 @@ class EngineArgs:
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
pooler_config: PoolerConfig | None = ModelConfig.pooler_config pooler_config: PoolerConfig | None = ModelConfig.pooler_config
override_pooler_config: dict | PoolerConfig | None = (
ModelConfig.override_pooler_config
)
compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config") compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config")
worker_cls: str = ParallelConfig.worker_cls worker_cls: str = ParallelConfig.worker_cls
worker_extension_cls: str = ParallelConfig.worker_extension_cls worker_extension_cls: str = ParallelConfig.worker_extension_cls
@ -659,11 +656,6 @@ class EngineArgs:
) )
model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"]) model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"])
model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"]) model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"])
model_group.add_argument(
"--override-pooler-config",
**model_kwargs["override_pooler_config"],
deprecated=True,
)
model_group.add_argument( model_group.add_argument(
"--logits-processor-pattern", **model_kwargs["logits_processor_pattern"] "--logits-processor-pattern", **model_kwargs["logits_processor_pattern"]
) )
@ -880,7 +872,11 @@ class EngineArgs:
"--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"] "--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
) )
cache_group.add_argument( cache_group.add_argument(
"--enable-prefix-caching", **cache_kwargs["enable_prefix_caching"] "--enable-prefix-caching",
**{
**cache_kwargs["enable_prefix_caching"],
"default": None,
},
) )
cache_group.add_argument( cache_group.add_argument(
"--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"] "--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]
@ -1144,8 +1140,8 @@ class EngineArgs:
return engine_args return engine_args
def create_model_config(self) -> ModelConfig: def create_model_config(self) -> ModelConfig:
# gguf file needs a specific model loader and doesn't use hf_repo # gguf file needs a specific model loader
if check_gguf_file(self.model): if is_gguf(self.model):
self.quantization = self.load_format = "gguf" self.quantization = self.load_format = "gguf"
# NOTE(woosuk): In V1, we use separate processes for workers (unless # NOTE(woosuk): In V1, we use separate processes for workers (unless
@ -1239,7 +1235,6 @@ class EngineArgs:
mm_encoder_tp_mode=self.mm_encoder_tp_mode, mm_encoder_tp_mode=self.mm_encoder_tp_mode,
mm_encoder_attn_backend=self.mm_encoder_attn_backend, mm_encoder_attn_backend=self.mm_encoder_attn_backend,
pooler_config=self.pooler_config, pooler_config=self.pooler_config,
override_pooler_config=self.override_pooler_config,
logits_processor_pattern=self.logits_processor_pattern, logits_processor_pattern=self.logits_processor_pattern,
generation_config=self.generation_config, generation_config=self.generation_config,
override_generation_config=self.override_generation_config, override_generation_config=self.override_generation_config,
@ -1812,9 +1807,11 @@ class EngineArgs:
if model_config.runner_type != "pooling": if model_config.runner_type != "pooling":
default_chunked_prefill = True default_chunked_prefill = True
# Disable prefix caching default for hybrid models # Disable prefix caching default for hybrid models and mamba-only
# since the feature is still experimental. # models since the feature is still experimental.
default_prefix_caching = not model_config.is_hybrid default_prefix_caching = not (
model_config.is_hybrid or model_config.is_attention_free
)
else: else:
assert model_config.pooler_config is not None assert model_config.pooler_config is not None
@ -2041,24 +2038,6 @@ class AsyncEngineArgs(EngineArgs):
enable_log_requests: bool = False enable_log_requests: bool = False
@property
@deprecated(
"`disable_log_requests` is deprecated and has been replaced with "
"`enable_log_requests`. This will be removed in v0.12.0. Please use "
"`enable_log_requests` instead."
)
def disable_log_requests(self) -> bool:
return not self.enable_log_requests
@disable_log_requests.setter
@deprecated(
"`disable_log_requests` is deprecated and has been replaced with "
"`enable_log_requests`. This will be removed in v0.12.0. Please use "
"`enable_log_requests` instead."
)
def disable_log_requests(self, value: bool):
self.enable_log_requests = not value
@staticmethod @staticmethod
def add_cli_args( def add_cli_args(
parser: FlexibleArgumentParser, async_args_only: bool = False parser: FlexibleArgumentParser, async_args_only: bool = False

View File

@ -174,9 +174,6 @@ class LLM:
For example, for Phi-3-Vision: `{"num_crops": 4}`. For example, for Phi-3-Vision: `{"num_crops": 4}`.
pooler_config: Initialize non-default pooling config for the pooling pooler_config: Initialize non-default pooling config for the pooling
model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`. model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
override_pooler_config: [DEPRECATED] Use `pooler_config` instead. This
argument is deprecated and will be removed in v0.12.0 or v1.0.0,
whichever is sooner.
compilation_config: Either an integer or a dictionary. If it is an compilation_config: Either an integer or a dictionary. If it is an
integer, it is used as the mode of compilation optimization. If it integer, it is used as the mode of compilation optimization. If it
is a dictionary, it can specify the full compilation configuration. is a dictionary, it can specify the full compilation configuration.
@ -214,7 +211,6 @@ class LLM:
hf_overrides: HfOverrides | None = None, hf_overrides: HfOverrides | None = None,
mm_processor_kwargs: dict[str, Any] | None = None, mm_processor_kwargs: dict[str, Any] | None = None,
pooler_config: PoolerConfig | None = None, pooler_config: PoolerConfig | None = None,
override_pooler_config: PoolerConfig | None = None,
structured_outputs_config: dict[str, Any] structured_outputs_config: dict[str, Any]
| StructuredOutputsConfig | StructuredOutputsConfig
| None = None, | None = None,
@ -330,7 +326,6 @@ class LLM:
hf_overrides=hf_overrides, hf_overrides=hf_overrides,
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=mm_processor_kwargs,
pooler_config=pooler_config, pooler_config=pooler_config,
override_pooler_config=override_pooler_config,
structured_outputs_config=structured_outputs_instance, structured_outputs_config=structured_outputs_instance,
compilation_config=compilation_config_instance, compilation_config=compilation_config_instance,
logits_processors=logits_processors, logits_processors=logits_processors,

View File

@ -29,7 +29,6 @@ from openai.types.responses import (
ResponseOutputItemAddedEvent, ResponseOutputItemAddedEvent,
ResponseOutputItemDoneEvent, ResponseOutputItemDoneEvent,
ResponsePrompt, ResponsePrompt,
ResponseReasoningItem,
ResponseReasoningTextDeltaEvent, ResponseReasoningTextDeltaEvent,
ResponseReasoningTextDoneEvent, ResponseReasoningTextDoneEvent,
ResponseStatus, ResponseStatus,
@ -304,9 +303,7 @@ def get_logits_processors(
return None return None
ResponseInputOutputItem: TypeAlias = ( ResponseInputOutputItem: TypeAlias = ResponseInputItemParam | ResponseOutputItem
ResponseInputItemParam | ResponseReasoningItem | ResponseFunctionToolCall
)
class ResponsesRequest(OpenAIBaseModel): class ResponsesRequest(OpenAIBaseModel):

View File

@ -10,6 +10,9 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
Function as FunctionCallTool, Function as FunctionCallTool,
) )
from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
from openai.types.responses.response_output_message import ResponseOutputMessage from openai.types.responses.response_output_message import ResponseOutputMessage
from openai.types.responses.response_reasoning_item import ResponseReasoningItem from openai.types.responses.response_reasoning_item import ResponseReasoningItem
from openai.types.responses.tool import Tool from openai.types.responses.tool import Tool
@ -94,6 +97,12 @@ def construct_chat_message_with_tool_call(
"role": "assistant", "role": "assistant",
"reasoning": reasoning_content, "reasoning": reasoning_content,
} }
elif isinstance(item, ResponseFunctionToolCallOutputItem):
return ChatCompletionToolMessageParam(
role="tool",
content=item.output,
tool_call_id=item.call_id,
)
elif item.get("type") == "function_call_output": elif item.get("type") == "function_call_output":
# Append the function call output as a tool message. # Append the function call output as a tool message.
return ChatCompletionToolMessageParam( return ChatCompletionToolMessageParam(

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
@ -12,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.config import (
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceNoOP, TopKWeightAndReduceNoOP,
) )
from vllm.model_executor.layers.fused_moe.utils import _resize_cache
from vllm.triton_utils import tl, triton from vllm.triton_utils import tl, triton
from vllm.utils.import_utils import has_triton_kernels from vllm.utils.import_utils import has_triton_kernels
@ -88,14 +90,17 @@ def triton_kernel_moe_forward(
gating_output, topk, sm_first=not renormalize gating_output, topk, sm_first=not renormalize
) )
output = torch.empty_like(hidden_states)
return triton_kernel_fused_experts( return triton_kernel_fused_experts(
None, output,
hidden_states, hidden_states,
w1, w1,
w2, w2,
routing_data, routing_data,
gather_idx, gather_idx,
scatter_idx, scatter_idx,
topk=topk,
activation=activation, activation=activation,
quant_config=quant_config, quant_config=quant_config,
apply_router_weight_on_input=apply_router_weight_on_input, apply_router_weight_on_input=apply_router_weight_on_input,
@ -113,6 +118,7 @@ def triton_kernel_fused_experts(
routing_data, # RoutingData routing_data, # RoutingData
gather_indx, # GatherIndx gather_indx, # GatherIndx
scatter_indx, # ScatterIndx scatter_indx, # ScatterIndx
topk: int,
activation: str = "silu", activation: str = "silu",
quant_config: FusedMoEQuantConfig | None = None, quant_config: FusedMoEQuantConfig | None = None,
swiglu_alpha: float = 1.702, swiglu_alpha: float = 1.702,
@ -120,6 +126,7 @@ def triton_kernel_fused_experts(
apply_router_weight_on_input: bool = False, apply_router_weight_on_input: bool = False,
global_num_experts: int = -1, global_num_experts: int = -1,
expert_map: torch.Tensor | None = None, expert_map: torch.Tensor | None = None,
intermediate_cache: torch.Tensor | None = None,
a1q_scale: torch.Tensor | None = None, a1q_scale: torch.Tensor | None = None,
) -> torch.Tensor: ) -> torch.Tensor:
if quant_config is None: if quant_config is None:
@ -131,14 +138,30 @@ def triton_kernel_fused_experts(
assert quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32 assert quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32
# Shape check, only check non-mxfp4 # Shape check, only check non-mxfp4
assert hidden_states.ndim == 2
assert hidden_states.shape[-1] == w1.shape[-2] assert hidden_states.shape[-1] == w1.shape[-2]
assert w2.shape[-1] == w1.shape[1] assert w2.shape[-1] == w1.shape[1]
batch_dim = 1
M, K = hidden_states.shape[-2:]
E, _, N = w1.shape E, _, N = w1.shape
if global_num_experts == -1: if global_num_experts == -1:
global_num_experts = E global_num_experts = E
if intermediate_cache is None:
intermediate_cache = torch.empty(
(batch_dim, M * topk, N // 2),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
# Add batch_dim to output buffer because matmul_ogs expects 3D output
intermediate_cache = _resize_cache(
intermediate_cache, (batch_dim, M * topk, N // 2)
)
output_tensor = _resize_cache(output_tensor, (batch_dim, M, K))
act = FusedActivation( act = FusedActivation(
FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
(swiglu_alpha, swiglu_limit), (swiglu_alpha, swiglu_limit),
@ -146,7 +169,7 @@ def triton_kernel_fused_experts(
) )
gammas = routing_data.gate_scal if routing_data else None gammas = routing_data.gate_scal if routing_data else None
intermediate_cache1 = matmul_ogs( matmul_ogs(
hidden_states, hidden_states,
w1, w1,
quant_config.w1_bias, quant_config.w1_bias,
@ -155,10 +178,11 @@ def triton_kernel_fused_experts(
precision_config=quant_config.w1_precision, precision_config=quant_config.w1_precision,
gammas=gammas if apply_router_weight_on_input else None, gammas=gammas if apply_router_weight_on_input else None,
fused_activation=act, fused_activation=act,
y=intermediate_cache,
) )
intermediate_cache3 = matmul_ogs( matmul_ogs(
intermediate_cache1, intermediate_cache.view(M * topk, N // 2),
w2, w2,
quant_config.w2_bias, quant_config.w2_bias,
routing_data, routing_data,
@ -167,7 +191,8 @@ def triton_kernel_fused_experts(
gammas=None if apply_router_weight_on_input else gammas, gammas=None if apply_router_weight_on_input else gammas,
y=output_tensor, y=output_tensor,
) )
return intermediate_cache3 output_tensor = output_tensor.view(M, K)
return output_tensor
def make_routing_data( def make_routing_data(
@ -221,6 +246,42 @@ class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
def supports_expert_map(self) -> bool: def supports_expert_map(self) -> bool:
return True return True
def moe_problem_size(
self,
a1: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_ids: torch.Tensor,
) -> tuple[int, int, int, int, int]:
"""
Extract the MoE problem size from the given tensor arguments:
- a: The hidden states, input to the MoE layer.
- w1: The first set of expert weights.
- w2: The second set of expert weights.
- topk_ids: The topk ids.
Note: extracting the problem shape from the weight and activation
tensors is not obvious. It needs to be done this way specifically
due to subtle issues with particular kernels, e.g. the int4 kernels
divide the trailing dimension by two, so it's not "correct" to
extract N or K from the trailing dimension of w1 or w2. Similarly,
some kernels transpose the weights, so this needs to be kept in mind.
Note: This implementation covers most cases. However, if experts
require a specialized implementation, like MarlinExperts, they are free
to override this function.
"""
assert w1.dim() == 3 and w2.dim() == 3
E, _, N = w1.size()
K = a1.size(-1)
assert a1.dim() == 2
assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
M = a1.size(0)
assert topk_ids.dim() == 2
topk = topk_ids.size(1)
return E, M, N, K, topk
def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
# Weight application and reduction happens in the fused_experts kernel. # Weight application and reduction happens in the fused_experts kernel.
return TopKWeightAndReduceNoOP() return TopKWeightAndReduceNoOP()
@ -263,8 +324,8 @@ class OAITritonExperts(BaseOAITritonExperts):
expert_tokens_meta: mk.ExpertTokensMetadata | None, expert_tokens_meta: mk.ExpertTokensMetadata | None,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
# workspace are allocated inside the kernel # workspace are allocated inside the kernel
workspace1 = (M, K) workspace1 = (0, 0)
workspace2 = (0, 0) workspace2 = (M * topk, N // 2)
output = (M, K) output = (M, K)
return (workspace1, workspace2, output) return (workspace1, workspace2, output)
@ -297,20 +358,21 @@ class OAITritonExperts(BaseOAITritonExperts):
topk_ids, topk_weights, local_num_experts topk_ids, topk_weights, local_num_experts
) )
experts_output = triton_kernel_fused_experts( topk = topk_ids.size(1)
None, triton_kernel_fused_experts(
output,
hidden_states, hidden_states,
w1, w1,
w2, w2,
routing_data, routing_data,
gather_indx, gather_indx,
scatter_indx, scatter_indx,
topk=topk,
activation=activation, activation=activation,
quant_config=self.quant_config, quant_config=self.quant_config,
apply_router_weight_on_input=False, apply_router_weight_on_input=False,
global_num_experts=local_num_experts, global_num_experts=local_num_experts,
expert_map=None, # applied already expert_map=None, # applied already
intermediate_cache=workspace2,
a1q_scale=a1q_scale, a1q_scale=a1q_scale,
) )
output.copy_(experts_output, non_blocking=True)

View File

@ -103,7 +103,7 @@ __all__ = [
"CompressedTensorsW8A8Int8MoEMethod", "CompressedTensorsW8A8Int8MoEMethod",
"CompressedTensorsWNA16MarlinMoEMethod", "CompressedTensorsWNA16MarlinMoEMethod",
"CompressedTensorsWNA16MoEMethod", "CompressedTensorsWNA16MoEMethod",
"CompressedTensorsW4A4MoeMethod", "CompressedTensorsW4A4Nvfp4MoeMethod",
"CompressedTensorsW4A8Int8MoEMethod", "CompressedTensorsW4A8Int8MoEMethod",
] ]
@ -171,7 +171,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
quant_config, layer.moe_config quant_config, layer.moe_config
) )
elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
return CompressedTensorsW4A4MoeMethod(layer.moe_config) return CompressedTensorsW4A4Nvfp4MoeMethod(layer.moe_config)
elif ( elif (
quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant) quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant) or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
@ -188,7 +188,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
) )
class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): class CompressedTensorsW4A4Nvfp4MoeMethod(CompressedTensorsMoEMethod):
def __init__(self, moe: FusedMoEConfig): def __init__(self, moe: FusedMoEConfig):
from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501
detect_nvfp4_moe_support, detect_nvfp4_moe_support,
@ -205,8 +205,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
self.flashinfer_moe_backend = get_flashinfer_moe_backend() self.flashinfer_moe_backend = get_flashinfer_moe_backend()
logger.info_once( logger.info_once(
f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels" f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
" for CompressedTensorsW4A4MoeMethod." " for CompressedTensorsW4A4Nvfp4MoeMethod."
) )
elif self.use_marlin:
logger.info_once("Using Marlin for CompressedTensorsW4A4Nvfp4MoeMethod.")
else:
logger.info_once("Using Cutlass for CompressedTensorsW4A4Nvfp4MoeMethod.")
def create_weights( def create_weights(
self, self,
@ -612,7 +616,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
assert expert_map is None, ( assert expert_map is None, (
"Expert Parallelism / expert_map " "Expert Parallelism / expert_map "
"is currently not supported for " "is currently not supported for "
"CompressedTensorsW4A4MoeMethod." "CompressedTensorsW4A4Nvfp4MoeMethod."
) )
assert self.moe_quant_config is not None assert self.moe_quant_config is not None

View File

@ -1132,6 +1132,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels" f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
" for ModelOptNvFp4FusedMoE." " for ModelOptNvFp4FusedMoE."
) )
elif self.use_marlin:
logger.info_once("Using Marlin for ModelOptNvFp4FusedMoE.")
else:
logger.info_once("Using Cutlass for ModelOptNvFp4FusedMoE.")
def maybe_make_prepare_finalize( def maybe_make_prepare_finalize(
self, self,

View File

@ -196,9 +196,10 @@ class Mxfp4Config(QuantizationConfig):
# TODO: Add support for MXFP4 Linear Method. # TODO: Add support for MXFP4 Linear Method.
# MXFP4 LinearMethod is available in AMD-Quark, refer to that implementation # MXFP4 LinearMethod is available in AMD-Quark, refer to that implementation
# if you are interested in enabling MXFP4 here. # if you are interested in enabling MXFP4 here.
logger.warning_once( logger.debug_once(
"MXFP4 linear layer is not implemented - falling back to " "MXFP4 linear layer is not implemented - falling back to "
"UnquantizedLinearMethod." "UnquantizedLinearMethod.",
scope="local",
) )
return UnquantizedLinearMethod() return UnquantizedLinearMethod()
elif isinstance(layer, FusedMoE): elif isinstance(layer, FusedMoE):
@ -208,9 +209,10 @@ class Mxfp4Config(QuantizationConfig):
return Mxfp4MoEMethod(layer.moe_config) return Mxfp4MoEMethod(layer.moe_config)
elif isinstance(layer, Attention): elif isinstance(layer, Attention):
# TODO: Add support for MXFP4 Attention. # TODO: Add support for MXFP4 Attention.
logger.warning_once( logger.debug_once(
"MXFP4 attention layer is not implemented. " "MXFP4 attention layer is not implemented. "
"Skipping quantization for this layer." "Skipping quantization for this layer.",
scope="local",
) )
return None return None

View File

@ -18,6 +18,7 @@ from vllm.model_executor.model_loader.utils import (
process_weights_after_loading, process_weights_after_loading,
) )
from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.model_loader.weight_utils import (
download_gguf,
get_gguf_extra_tensor_names, get_gguf_extra_tensor_names,
get_gguf_weight_type_map, get_gguf_weight_type_map,
gguf_quant_weights_iterator, gguf_quant_weights_iterator,
@ -43,7 +44,8 @@ class GGUFModelLoader(BaseModelLoader):
f"load format {load_config.load_format}" f"load format {load_config.load_format}"
) )
def _prepare_weights(self, model_name_or_path: str): def _prepare_weights(self, model_config: ModelConfig):
model_name_or_path = model_config.model
if os.path.isfile(model_name_or_path): if os.path.isfile(model_name_or_path):
return model_name_or_path return model_name_or_path
# for raw HTTPS link # for raw HTTPS link
@ -55,12 +57,23 @@ class GGUFModelLoader(BaseModelLoader):
if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"): if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"):
repo_id, filename = model_name_or_path.rsplit("/", 1) repo_id, filename = model_name_or_path.rsplit("/", 1)
return hf_hub_download(repo_id=repo_id, filename=filename) return hf_hub_download(repo_id=repo_id, filename=filename)
else: # repo_id:quant_type
raise ValueError( elif "/" in model_name_or_path and ":" in model_name_or_path:
f"Unrecognised GGUF reference: {model_name_or_path} " repo_id, quant_type = model_name_or_path.rsplit(":", 1)
"(expected local file, raw URL, or <repo_id>/<filename>.gguf)" return download_gguf(
repo_id,
quant_type,
cache_dir=self.load_config.download_dir,
revision=model_config.revision,
ignore_patterns=self.load_config.ignore_patterns,
) )
raise ValueError(
f"Unrecognised GGUF reference: {model_name_or_path} "
"(expected local file, raw URL, <repo_id>/<filename>.gguf, "
"or <repo_id>:<quant_type>)"
)
def _get_gguf_weights_map(self, model_config: ModelConfig): def _get_gguf_weights_map(self, model_config: ModelConfig):
""" """
GGUF uses this naming convention for their tensors from HF checkpoint: GGUF uses this naming convention for their tensors from HF checkpoint:
@ -244,7 +257,7 @@ class GGUFModelLoader(BaseModelLoader):
gguf_to_hf_name_map: dict[str, str], gguf_to_hf_name_map: dict[str, str],
) -> dict[str, str]: ) -> dict[str, str]:
weight_type_map = get_gguf_weight_type_map( weight_type_map = get_gguf_weight_type_map(
model_config.model, gguf_to_hf_name_map model_name_or_path, gguf_to_hf_name_map
) )
is_multimodal = hasattr(model_config.hf_config, "vision_config") is_multimodal = hasattr(model_config.hf_config, "vision_config")
if is_multimodal: if is_multimodal:
@ -290,10 +303,10 @@ class GGUFModelLoader(BaseModelLoader):
yield from gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) yield from gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map)
def download_model(self, model_config: ModelConfig) -> None: def download_model(self, model_config: ModelConfig) -> None:
self._prepare_weights(model_config.model) self._prepare_weights(model_config)
def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
local_model_path = self._prepare_weights(model_config.model) local_model_path = self._prepare_weights(model_config)
gguf_weights_map = self._get_gguf_weights_map(model_config) gguf_weights_map = self._get_gguf_weights_map(model_config)
model.load_weights( model.load_weights(
self._get_weights_iterator(model_config, local_model_path, gguf_weights_map) self._get_weights_iterator(model_config, local_model_path, gguf_weights_map)
@ -303,7 +316,7 @@ class GGUFModelLoader(BaseModelLoader):
self, vllm_config: VllmConfig, model_config: ModelConfig self, vllm_config: VllmConfig, model_config: ModelConfig
) -> nn.Module: ) -> nn.Module:
device_config = vllm_config.device_config device_config = vllm_config.device_config
local_model_path = self._prepare_weights(model_config.model) local_model_path = self._prepare_weights(model_config)
gguf_weights_map = self._get_gguf_weights_map(model_config) gguf_weights_map = self._get_gguf_weights_map(model_config)
# we can only know if tie word embeddings after mapping weights # we can only know if tie word embeddings after mapping weights
if "lm_head.weight" in get_gguf_extra_tensor_names( if "lm_head.weight" in get_gguf_extra_tensor_names(

View File

@ -369,6 +369,52 @@ def get_sparse_attention_config(
return config return config
def download_gguf(
repo_id: str,
quant_type: str,
cache_dir: str | None = None,
revision: str | None = None,
ignore_patterns: str | list[str] | None = None,
) -> str:
# Use patterns that snapshot_download can handle directly
# Patterns to match:
# - *-{quant_type}.gguf (root)
# - *-{quant_type}-*.gguf (root sharded)
# - */*-{quant_type}.gguf (subdir)
# - */*-{quant_type}-*.gguf (subdir sharded)
allow_patterns = [
f"*-{quant_type}.gguf",
f"*-{quant_type}-*.gguf",
f"*/*-{quant_type}.gguf",
f"*/*-{quant_type}-*.gguf",
]
# Use download_weights_from_hf which handles caching and downloading
folder = download_weights_from_hf(
model_name_or_path=repo_id,
cache_dir=cache_dir,
allow_patterns=allow_patterns,
revision=revision,
ignore_patterns=ignore_patterns,
)
# Find the downloaded file(s) in the folder
local_files = []
for pattern in allow_patterns:
# Convert pattern to glob pattern for local filesystem
glob_pattern = os.path.join(folder, pattern)
local_files.extend(glob.glob(glob_pattern))
if not local_files:
raise ValueError(
f"Downloaded GGUF files not found in {folder} for quant_type {quant_type}"
)
# Sort to ensure consistent ordering (prefer non-sharded files)
local_files.sort(key=lambda x: (x.count("-"), x))
return local_files[0]
def download_weights_from_hf( def download_weights_from_hf(
model_name_or_path: str, model_name_or_path: str,
cache_dir: str | None, cache_dir: str | None,

View File

@ -233,7 +233,7 @@ class BaiChuanDecoderLayer(nn.Module):
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
position_embedding=position_embedding, position_embedding=position_embedding,
rope_parameters=config.rope_parameters, rope_parameters=getattr(config, "rope_parameters", None),
max_position_embeddings=max_position_embeddings, max_position_embeddings=max_position_embeddings,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,

View File

@ -289,9 +289,6 @@ class MambaModelConfig(VerifyAndUpdateConfig):
model_config = vllm_config.model_config model_config = vllm_config.model_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
if cache_config.mamba_block_size is None:
cache_config.mamba_block_size = model_config.max_model_len
if cache_config.enable_prefix_caching: if cache_config.enable_prefix_caching:
if model_config.supports_mamba_prefix_caching: if model_config.supports_mamba_prefix_caching:
logger.info( logger.info(
@ -299,6 +296,11 @@ class MambaModelConfig(VerifyAndUpdateConfig):
"Its support for Mamba layers is experimental. " "Its support for Mamba layers is experimental. "
"Please report any issues you may observe." "Please report any issues you may observe."
) )
# By default, mamba block size will be set to max_model_len (see
# below). When enabling prefix caching, we align mamba block size
# to the block size as the basic granularity for prefix caching.
if cache_config.mamba_block_size is None:
cache_config.mamba_block_size = cache_config.block_size
else: else:
logger.info( logger.info(
"Hybrid or mamba-based model detected without " "Hybrid or mamba-based model detected without "
@ -306,6 +308,9 @@ class MambaModelConfig(VerifyAndUpdateConfig):
) )
cache_config.enable_prefix_caching = False cache_config.enable_prefix_caching = False
if cache_config.mamba_block_size is None:
cache_config.mamba_block_size = model_config.max_model_len
# TODO(tdoublep): remove once cascade attention is supported # TODO(tdoublep): remove once cascade attention is supported
logger.info( logger.info(
"Disabling cascade attention since it is not supported for hybrid models." "Disabling cascade attention since it is not supported for hybrid models."

View File

@ -100,7 +100,7 @@ class GPTJAttention(nn.Module):
self.head_size, self.head_size,
rotary_dim=config.rotary_dim, rotary_dim=config.rotary_dim,
max_position=max_position_embeddings, max_position=max_position_embeddings,
rope_parameters=config.rope_parameters, rope_parameters=getattr(config, "rope_parameters", None),
is_neox_style=False, is_neox_style=False,
) )
self.attn = Attention( self.attn = Attention(

View File

@ -239,7 +239,7 @@ class Grok1DecoderLayer(nn.Module):
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings, max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_parameters=config.rope_parameters, rope_parameters=getattr(config, "rope_parameters", None),
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.attn", prefix=f"{prefix}.attn",

View File

@ -262,7 +262,7 @@ class LlamaAttention(nn.Module):
self.head_dim, self.head_dim,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
max_position=self.max_position_embeddings, max_position=self.max_position_embeddings,
rope_parameters=config.rope_parameters, rope_parameters=getattr(config, "rope_parameters", None),
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor, partial_rotary_factor=self.partial_rotary_factor,
) )

View File

@ -5,7 +5,6 @@ Whenever you add an architecture to this page, please also update
`tests/models/registry.py` with example HuggingFace models for it. `tests/models/registry.py` with example HuggingFace models for it.
""" """
import hashlib
import importlib import importlib
import json import json
import os import os
@ -32,6 +31,7 @@ from vllm.config import (
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.logging_utils import logtime from vllm.logging_utils import logtime
from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module
from vllm.utils.hashing import safe_hash
from .interfaces import ( from .interfaces import (
has_inner_state, has_inner_state,
@ -655,7 +655,7 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
if model_path.exists(): if model_path.exists():
with open(model_path, "rb") as f: with open(model_path, "rb") as f:
module_hash = hashlib.md5(f.read(), usedforsecurity=False).hexdigest() module_hash = safe_hash(f.read(), usedforsecurity=False).hexdigest()
mi = self._load_modelinfo_from_cache(module_hash) mi = self._load_modelinfo_from_cache(module_hash)
if mi is not None: if mi is not None:

View File

@ -407,9 +407,6 @@ class CudaPlatformBase(Platform):
# We have found some valid backends. Select the one with the # We have found some valid backends. Select the one with the
# highest priority. # highest priority.
logger.info(
"Valid backends: %s", [b[0].name for b in valid_backends_priorities]
)
sorted_indices = sorted( sorted_indices = sorted(
range(len(valid_backends_priorities)), range(len(valid_backends_priorities)),
key=lambda i: valid_backends_priorities[i][1], key=lambda i: valid_backends_priorities[i][1],
@ -417,8 +414,9 @@ class CudaPlatformBase(Platform):
selected_index = sorted_indices[0] selected_index = sorted_indices[0]
selected_backend = valid_backends_priorities[selected_index][0] selected_backend = valid_backends_priorities[selected_index][0]
logger.info( logger.info(
"Using %s backend.", "Using %s attention backend out of potential backends: %s",
selected_backend.name, selected_backend.name,
[b[0].name for b in valid_backends_priorities],
) )
return selected_backend.get_path() return selected_backend.get_path()

View File

@ -42,7 +42,10 @@ from vllm.logger import init_logger
from vllm.transformers_utils.config_parser_base import ConfigParserBase from vllm.transformers_utils.config_parser_base import ConfigParserBase
from vllm.transformers_utils.utils import ( from vllm.transformers_utils.utils import (
check_gguf_file, check_gguf_file,
is_gguf,
is_remote_gguf,
parse_safetensors_file_metadata, parse_safetensors_file_metadata,
split_remote_gguf,
) )
if envs.VLLM_USE_MODELSCOPE: if envs.VLLM_USE_MODELSCOPE:
@ -453,51 +456,55 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No
def patch_rope_parameters(config: PretrainedConfig) -> None: def patch_rope_parameters(config: PretrainedConfig) -> None:
"""Provide backwards compatibility for RoPE.""" """Provide backwards compatibility for RoPE."""
# Retrieve rope_parameters differently based on Transformers version # Patch rope_parameters differently based on Transformers version
if Version(version("transformers")) >= Version("5.0.0.dev0"): if Version(version("transformers")) >= Version("5.0.0.dev0"):
from transformers.modeling_rope_utils import RopeParameters from transformers.modeling_rope_utils import (
rope_config_validation,
rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr( standardize_rope_params,
config, "rope_parameters", None
) )
elif hasattr(config, "rope_parameters"):
# We are in Transformers v4 and rope_parameters # When Transformers v5 is installed, legacy rope_theta may be present
# has already been patched for this config # when using custom code models written for Transformers v4
return if (rope_theta := getattr(config, "rope_theta", None)) is not None:
standardize_rope_params(config, rope_theta=rope_theta)
rope_config_validation(config)
# Delete rope_theta to avoid confusion in downstream code
del config.rope_theta
else: else:
# Convert Transformers v4 rope_theta and rope_scaling into rope_parameters # When Transformers v4 is installed, legacy rope_scaling may be present
rope_theta: float | None = getattr(config, "rope_theta", None) if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
rope_scaling: dict | None = getattr(config, "rope_scaling", None) config.rope_parameters = rope_scaling
rope_parameters = rope_scaling # When Transformers v4 is installed, legacy rope_theta may be present
# Move rope_theta into rope_parameters if (rope_theta := getattr(config, "rope_theta", None)) is not None:
if rope_theta is not None: if not hasattr(config, "rope_parameters"):
rope_parameters = rope_parameters or {"rope_type": "default"} config.rope_parameters = {"rope_type": "default"}
rope_parameters["rope_theta"] = rope_theta config.rope_parameters["rope_theta"] = rope_theta
# Add original_max_position_embeddings if present
if rope_parameters and (
ompe := getattr(config, "original_max_position_embeddings", None)
):
rope_parameters["original_max_position_embeddings"] = ompe
# Write back to config
config.rope_parameters = rope_parameters
# No RoPE parameters to patch # No RoPE parameters to patch
if rope_parameters is None: if not hasattr(config, "rope_parameters"):
return return
# Add original_max_position_embeddings if present
if ompe := getattr(config, "original_max_position_embeddings", None):
config.rope_parameters["original_max_position_embeddings"] = ompe
# Handle nested rope_parameters in interleaved sliding attention models # Handle nested rope_parameters in interleaved sliding attention models
if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): if set(config.rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
for rope_parameters_layer_type in rope_parameters.values(): for rope_parameters_layer_type in config.rope_parameters.values():
patch_rope_parameters_dict(rope_parameters_layer_type) patch_rope_parameters_dict(rope_parameters_layer_type)
else: else:
patch_rope_parameters_dict(rope_parameters) patch_rope_parameters_dict(config.rope_parameters)
def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None: def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
if "rope_type" in rope_parameters and "type" in rope_parameters: if "rope_type" in rope_parameters and "type" in rope_parameters:
rope_type = rope_parameters["rope_type"] rope_type = rope_parameters["rope_type"]
rope_type_legacy = rope_parameters["type"] rope_type_legacy = rope_parameters["type"]
if rope_type != rope_type_legacy: if (rope_type_legacy == "su" and rope_type == "longrope") or (
rope_type_legacy == "mrope" and rope_type == "default"
):
pass # No action needed
elif rope_type != rope_type_legacy:
raise ValueError( raise ValueError(
f"Found conflicts between 'rope_type={rope_type}' (modern " f"Found conflicts between 'rope_type={rope_type}' (modern "
f"field) and 'type={rope_type_legacy}' (legacy field). " f"field) and 'type={rope_type_legacy}' (legacy field). "
@ -629,10 +636,12 @@ def maybe_override_with_speculators(
Returns: Returns:
Tuple of (resolved_model, resolved_tokenizer, speculative_config) Tuple of (resolved_model, resolved_tokenizer, speculative_config)
""" """
is_gguf = check_gguf_file(model) if check_gguf_file(model):
if is_gguf:
kwargs["gguf_file"] = Path(model).name kwargs["gguf_file"] = Path(model).name
gguf_model_repo = Path(model).parent gguf_model_repo = Path(model).parent
elif is_remote_gguf(model):
repo_id, _ = split_remote_gguf(model)
gguf_model_repo = Path(repo_id)
else: else:
gguf_model_repo = None gguf_model_repo = None
kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
@ -678,10 +687,18 @@ def get_config(
) -> PretrainedConfig: ) -> PretrainedConfig:
# Separate model folder from file path for GGUF models # Separate model folder from file path for GGUF models
is_gguf = check_gguf_file(model) _is_gguf = is_gguf(model)
if is_gguf: _is_remote_gguf = is_remote_gguf(model)
kwargs["gguf_file"] = Path(model).name if _is_gguf:
model = Path(model).parent if check_gguf_file(model):
# Local GGUF file
kwargs["gguf_file"] = Path(model).name
model = Path(model).parent
elif _is_remote_gguf:
# Remote GGUF - extract repo_id from repo_id:quant_type format
# The actual GGUF file will be downloaded later by GGUFModelLoader
# Keep model as repo_id:quant_type for download, but use repo_id for config
model, _ = split_remote_gguf(model)
if config_format == "auto": if config_format == "auto":
try: try:
@ -689,10 +706,25 @@ def get_config(
# Transformers implementation. # Transformers implementation.
if file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision): if file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision):
config_format = "mistral" config_format = "mistral"
elif is_gguf or file_or_path_exists( elif (_is_gguf and not _is_remote_gguf) or file_or_path_exists(
model, HF_CONFIG_NAME, revision=revision model, HF_CONFIG_NAME, revision=revision
): ):
config_format = "hf" config_format = "hf"
# Remote GGUF models must have config.json in repo,
# otherwise the config can't be parsed correctly.
# FIXME(Isotr0py): Support remote GGUF repos without config.json
elif _is_remote_gguf and not file_or_path_exists(
model, HF_CONFIG_NAME, revision=revision
):
err_msg = (
"Could not find config.json for remote GGUF model repo. "
"To load remote GGUF model through `<repo_id>:<quant_type>`, "
"ensure your model has config.json (HF format) file. "
"Otherwise please specify --hf-config-path <original_repo> "
"in engine args to fetch config from unquantized hf model."
)
logger.error(err_msg)
raise ValueError(err_msg)
else: else:
raise ValueError( raise ValueError(
"Could not detect config format for no config file found. " "Could not detect config format for no config file found. "
@ -713,9 +745,6 @@ def get_config(
"'config.json'.\n" "'config.json'.\n"
" - For Mistral models: ensure the presence of a " " - For Mistral models: ensure the presence of a "
"'params.json'.\n" "'params.json'.\n"
"3. For GGUF: pass the local path of the GGUF checkpoint.\n"
" Loading GGUF from a remote repo directly is not yet "
"supported.\n"
).format(model=model) ).format(model=model)
raise ValueError(error_message) from e raise ValueError(error_message) from e
@ -729,7 +758,7 @@ def get_config(
**kwargs, **kwargs,
) )
# Special architecture mapping check for GGUF models # Special architecture mapping check for GGUF models
if is_gguf: if _is_gguf:
if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
raise RuntimeError(f"Can't get gguf config for {config.model_type}.") raise RuntimeError(f"Can't get gguf config for {config.model_type}.")
model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
@ -889,6 +918,8 @@ def get_pooling_config(model: str, revision: str | None = "main") -> dict | None
A dictionary containing the pooling type and whether A dictionary containing the pooling type and whether
normalization is used, or None if no pooling configuration is found. normalization is used, or None if no pooling configuration is found.
""" """
if is_remote_gguf(model):
model, _ = split_remote_gguf(model)
modules_file_name = "modules.json" modules_file_name = "modules.json"
@ -1108,6 +1139,8 @@ def get_hf_image_processor_config(
# Separate model folder from file path for GGUF models # Separate model folder from file path for GGUF models
if check_gguf_file(model): if check_gguf_file(model):
model = Path(model).parent model = Path(model).parent
elif is_remote_gguf(model):
model, _ = split_remote_gguf(model)
return get_image_processor_config( return get_image_processor_config(
model, token=hf_token, revision=revision, **kwargs model, token=hf_token, revision=revision, **kwargs
) )

View File

@ -18,7 +18,7 @@ from transformers.processing_utils import ProcessorMixin
from transformers.video_processing_utils import BaseVideoProcessor from transformers.video_processing_utils import BaseVideoProcessor
from typing_extensions import TypeVar from typing_extensions import TypeVar
from vllm.transformers_utils.utils import check_gguf_file, convert_model_repo_to_path from vllm.transformers_utils.utils import convert_model_repo_to_path, is_gguf
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
if TYPE_CHECKING: if TYPE_CHECKING:
@ -236,8 +236,8 @@ def cached_processor_from_config(
processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin, processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
**kwargs: Any, **kwargs: Any,
) -> _P: ) -> _P:
if check_gguf_file(model_config.model): if is_gguf(model_config.model):
assert not check_gguf_file(model_config.tokenizer), ( assert not is_gguf(model_config.tokenizer), (
"For multimodal GGUF models, the original tokenizer " "For multimodal GGUF models, the original tokenizer "
"should be used to correctly load processor." "should be used to correctly load processor."
) )
@ -350,8 +350,8 @@ def cached_image_processor_from_config(
model_config: "ModelConfig", model_config: "ModelConfig",
**kwargs: Any, **kwargs: Any,
): ):
if check_gguf_file(model_config.model): if is_gguf(model_config.model):
assert not check_gguf_file(model_config.tokenizer), ( assert not is_gguf(model_config.tokenizer), (
"For multimodal GGUF models, the original tokenizer " "For multimodal GGUF models, the original tokenizer "
"should be used to correctly load image processor." "should be used to correctly load image processor."
) )

View File

@ -20,7 +20,12 @@ from vllm.transformers_utils.config import (
list_filtered_repo_files, list_filtered_repo_files,
) )
from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.tokenizers import MistralTokenizer
from vllm.transformers_utils.utils import check_gguf_file from vllm.transformers_utils.utils import (
check_gguf_file,
is_gguf,
is_remote_gguf,
split_remote_gguf,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import ModelConfig from vllm.config import ModelConfig
@ -180,10 +185,12 @@ def get_tokenizer(
kwargs["truncation_side"] = "left" kwargs["truncation_side"] = "left"
# Separate model folder from file path for GGUF models # Separate model folder from file path for GGUF models
is_gguf = check_gguf_file(tokenizer_name) if is_gguf(tokenizer_name):
if is_gguf: if check_gguf_file(tokenizer_name):
kwargs["gguf_file"] = Path(tokenizer_name).name kwargs["gguf_file"] = Path(tokenizer_name).name
tokenizer_name = Path(tokenizer_name).parent tokenizer_name = Path(tokenizer_name).parent
elif is_remote_gguf(tokenizer_name):
tokenizer_name, _ = split_remote_gguf(tokenizer_name)
# if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
# first to use official Mistral tokenizer if possible. # first to use official Mistral tokenizer if possible.

View File

@ -9,6 +9,8 @@ from os import PathLike
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from gguf import GGMLQuantizationType
import vllm.envs as envs import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
@ -46,6 +48,57 @@ def check_gguf_file(model: str | PathLike) -> bool:
return False return False
@cache
def is_remote_gguf(model: str | Path) -> bool:
"""Check if the model is a remote GGUF model."""
model = str(model)
return (
(not is_cloud_storage(model))
and (not model.startswith(("http://", "https://")))
and ("/" in model and ":" in model)
and is_valid_gguf_quant_type(model.rsplit(":", 1)[1])
)
def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
"""Check if the quant type is a valid GGUF quant type."""
return getattr(GGMLQuantizationType, gguf_quant_type, None) is not None
def split_remote_gguf(model: str | Path) -> tuple[str, str]:
"""Split the model into repo_id and quant type."""
model = str(model)
if is_remote_gguf(model):
parts = model.rsplit(":", 1)
return (parts[0], parts[1])
raise ValueError(
"Wrong GGUF model or invalid GGUF quant type: %s.\n"
"- It should be in repo_id:quant_type format.\n"
"- Valid GGMLQuantizationType values: %s",
model,
GGMLQuantizationType._member_names_,
)
def is_gguf(model: str | Path) -> bool:
"""Check if the model is a GGUF model.
Args:
model: Model name, path, or Path object to check.
Returns:
True if the model is a GGUF model, False otherwise.
"""
model = str(model)
# Check if it's a local GGUF file
if check_gguf_file(model):
return True
# Check if it's a remote GGUF model (repo_id:quant_type format)
return is_remote_gguf(model)
def modelscope_list_repo_files( def modelscope_list_repo_files(
repo_id: str, repo_id: str,
revision: str | None = None, revision: str | None = None,

View File

@ -73,14 +73,6 @@ class FlexibleArgumentParser(ArgumentParser):
# Enable the deprecated kwarg for Python 3.12 and below # Enable the deprecated kwarg for Python 3.12 and below
def parse_known_args(self, args=None, namespace=None): def parse_known_args(self, args=None, namespace=None):
if args is not None and "--disable-log-requests" in args:
# Special case warning because the warning below won't trigger
# if -disable-log-requests because its value is default.
logger.warning_once(
"argument '--disable-log-requests' is deprecated and "
"replaced with '--enable-log-requests'. This will be "
"removed in v0.12.0."
)
namespace, args = super().parse_known_args(args, namespace) namespace, args = super().parse_known_args(args, namespace)
for action in FlexibleArgumentParser._deprecated: for action in FlexibleArgumentParser._deprecated:
if ( if (

View File

@ -5,6 +5,7 @@ from __future__ import annotations
import hashlib import hashlib
import pickle import pickle
from _hashlib import HASH, UnsupportedDigestmodError
from collections.abc import Callable from collections.abc import Callable
from typing import Any from typing import Any
@ -61,3 +62,20 @@ def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
return sha256_cbor return sha256_cbor
raise ValueError(f"Unsupported hash function: {hash_fn_name}") raise ValueError(f"Unsupported hash function: {hash_fn_name}")
def safe_hash(data: bytes, usedforsecurity: bool = True) -> HASH:
"""Hash for configs, defaulting to md5 but falling back to sha256
in FIPS constrained environments.
Args:
data: bytes
usedforsecurity: Whether the hash is used for security purposes
Returns:
Hash object
"""
try:
return hashlib.md5(data, usedforsecurity=usedforsecurity)
except (UnsupportedDigestmodError, ValueError):
return hashlib.sha256(data)

View File

@ -56,6 +56,39 @@ def set_env_var(key: str, value: str) -> Iterator[None]:
os.environ[key] = old os.environ[key] = old
@contextlib.contextmanager
def suppress_stdout():
"""
Suppress stdout from C libraries at the file descriptor level.
Only suppresses stdout, not stderr, to preserve error messages.
Suppression is disabled when VLLM_LOGGING_LEVEL is set to DEBUG.
Example:
with suppress_stdout():
# C library calls that would normally print to stdout
torch.distributed.new_group(ranks, backend="gloo")
"""
# Don't suppress if logging level is DEBUG
if envs.VLLM_LOGGING_LEVEL == "DEBUG":
yield
return
stdout_fd = sys.stdout.fileno()
stdout_dup = os.dup(stdout_fd)
devnull_fd = os.open(os.devnull, os.O_WRONLY)
try:
sys.stdout.flush()
os.dup2(devnull_fd, stdout_fd)
yield
finally:
sys.stdout.flush()
os.dup2(stdout_dup, stdout_fd)
os.close(stdout_dup)
os.close(devnull_fd)
# File path utilities # File path utilities

View File

@ -13,6 +13,8 @@ from vllm.distributed.kv_events import (
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.v1.core.kv_cache_utils import ( from vllm.v1.core.kv_cache_utils import (
BlockHash, BlockHash,
BlockHashList,
BlockHashListWithBlockSize,
BlockHashWithGroupId, BlockHashWithGroupId,
ExternalBlockHash, ExternalBlockHash,
FreeKVCacheBlockQueue, FreeKVCacheBlockQueue,
@ -133,6 +135,10 @@ class BlockPool:
Args: Args:
num_gpu_blocks: The number of blocks in the pool. num_gpu_blocks: The number of blocks in the pool.
enable_caching: Whether to enable prefix caching. enable_caching: Whether to enable prefix caching.
hash_block_size: The block size of which the block hashes are computed.
The actual block size usually equals hash_block_size, but in cases
where different KV cache groups have different block sizes, the
actual block size can be a multiple of hash_block_size.
enable_kv_cache_events: Whether to enable kv cache events. enable_kv_cache_events: Whether to enable kv cache events.
""" """
@ -140,11 +146,13 @@ class BlockPool:
self, self,
num_gpu_blocks: int, num_gpu_blocks: int,
enable_caching: bool, enable_caching: bool,
hash_block_size: int,
enable_kv_cache_events: bool = False, enable_kv_cache_events: bool = False,
): ):
assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
self.num_gpu_blocks = num_gpu_blocks self.num_gpu_blocks = num_gpu_blocks
self.enable_caching = enable_caching self.enable_caching = enable_caching
self.hash_block_size = hash_block_size
# All kv-cache blocks. # All kv-cache blocks.
self.blocks: list[KVCacheBlock] = [ self.blocks: list[KVCacheBlock] = [
KVCacheBlock(idx) for idx in range(num_gpu_blocks) KVCacheBlock(idx) for idx in range(num_gpu_blocks)
@ -223,8 +231,20 @@ class BlockPool:
return return
new_full_blocks = blocks[num_cached_blocks:num_full_blocks] new_full_blocks = blocks[num_cached_blocks:num_full_blocks]
assert len(request.block_hashes) >= num_full_blocks assert len(request.block_hashes) >= num_full_blocks
new_block_hashes = request.block_hashes[num_cached_blocks:] if block_size == self.hash_block_size:
# Common case.
block_hashes: BlockHashList = request.block_hashes
else:
# block_size is a multiple of hash_block_size. This happens when
# different KV cache groups have different block sizes.
assert block_size % self.hash_block_size == 0
# Recalculate block_hashes at the granularity of block_size, using
# the original block_hashes (at the granularity of hash_block_size).
block_hashes = BlockHashListWithBlockSize(
request.block_hashes, self.hash_block_size, block_size
)
new_block_hashes = block_hashes[num_cached_blocks:]
new_hashes: list[ExternalBlockHash] | None = ( new_hashes: list[ExternalBlockHash] | None = (
[] if self.enable_kv_cache_events else None [] if self.enable_kv_cache_events else None
) )

View File

@ -2,15 +2,25 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections.abc import Sequence from collections.abc import Sequence
from math import lcm
from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.core.kv_cache_utils import (
BlockHash,
BlockHashList,
BlockHashListWithBlockSize,
KVCacheBlock,
)
from vllm.v1.core.single_type_kv_cache_manager import ( from vllm.v1.core.single_type_kv_cache_manager import (
CrossAttentionManager, CrossAttentionManager,
FullAttentionManager, FullAttentionManager,
get_manager_for_kv_cache_spec, get_manager_for_kv_cache_spec,
) )
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig, KVCacheSpec from vllm.v1.kv_cache_interface import (
FullAttentionSpec,
KVCacheConfig,
KVCacheSpec,
)
from vllm.v1.request import Request from vllm.v1.request import Request
@ -28,13 +38,17 @@ class KVCacheCoordinator(ABC):
enable_kv_cache_events: bool, enable_kv_cache_events: bool,
dcp_world_size: int, dcp_world_size: int,
pcp_world_size: int, pcp_world_size: int,
hash_block_size: int,
): ):
self.kv_cache_config = kv_cache_config self.kv_cache_config = kv_cache_config
self.max_model_len = max_model_len self.max_model_len = max_model_len
self.enable_caching = enable_caching self.enable_caching = enable_caching
self.block_pool = BlockPool( self.block_pool = BlockPool(
kv_cache_config.num_blocks, enable_caching, enable_kv_cache_events kv_cache_config.num_blocks,
enable_caching,
hash_block_size,
enable_kv_cache_events,
) )
# Needs special handling for find_longest_cache_hit if eagle is enabled # Needs special handling for find_longest_cache_hit if eagle is enabled
@ -213,6 +227,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
enable_kv_cache_events: bool, enable_kv_cache_events: bool,
dcp_world_size: int, dcp_world_size: int,
pcp_world_size: int, pcp_world_size: int,
hash_block_size: int,
): ):
super().__init__( super().__init__(
kv_cache_config, kv_cache_config,
@ -222,6 +237,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
enable_kv_cache_events, enable_kv_cache_events,
dcp_world_size=dcp_world_size, dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size, pcp_world_size=pcp_world_size,
hash_block_size=hash_block_size,
) )
self.num_single_type_manager = len(self.single_type_managers) self.num_single_type_manager = len(self.single_type_managers)
@ -255,6 +271,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
enable_kv_cache_events: bool, enable_kv_cache_events: bool,
dcp_world_size: int, dcp_world_size: int,
pcp_world_size: int, pcp_world_size: int,
hash_block_size: int,
): ):
super().__init__( super().__init__(
kv_cache_config, kv_cache_config,
@ -264,6 +281,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
enable_kv_cache_events, enable_kv_cache_events,
dcp_world_size=dcp_world_size, dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size, pcp_world_size=pcp_world_size,
hash_block_size=hash_block_size,
) )
self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[0].kv_cache_spec self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[0].kv_cache_spec
self.block_size = self.kv_cache_spec.block_size self.block_size = self.kv_cache_spec.block_size
@ -273,6 +291,11 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
self.block_size *= dcp_world_size self.block_size *= dcp_world_size
if pcp_world_size > 1: if pcp_world_size > 1:
self.block_size *= pcp_world_size self.block_size *= pcp_world_size
# For models using only Mamba, block_size is set to max_model_len when
# prefix caching is disabled, and hash_block_size validation is skipped.
assert not enable_caching or (hash_block_size == self.block_size), (
"UnitaryKVCacheCoordinator assumes hash_block_size == block_size"
)
assert len(self.kv_cache_config.kv_cache_groups) == 1, ( assert len(self.kv_cache_config.kv_cache_groups) == 1, (
"UnitaryKVCacheCoordinator assumes only one kv cache group" "UnitaryKVCacheCoordinator assumes only one kv cache group"
) )
@ -289,6 +312,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
block_pool=self.block_pool, block_pool=self.block_pool,
kv_cache_spec=self.kv_cache_spec, kv_cache_spec=self.kv_cache_spec,
use_eagle=self.use_eagle, use_eagle=self.use_eagle,
alignment_tokens=self.block_size,
dcp_world_size=self.dcp_world_size, dcp_world_size=self.dcp_world_size,
pcp_world_size=self.pcp_world_size, pcp_world_size=self.pcp_world_size,
) )
@ -313,6 +337,7 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
enable_kv_cache_events: bool, enable_kv_cache_events: bool,
dcp_world_size: int, dcp_world_size: int,
pcp_world_size: int, pcp_world_size: int,
hash_block_size: int,
): ):
super().__init__( super().__init__(
kv_cache_config, kv_cache_config,
@ -322,7 +347,17 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
enable_kv_cache_events, enable_kv_cache_events,
dcp_world_size=dcp_world_size, dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size, pcp_world_size=pcp_world_size,
hash_block_size=hash_block_size,
) )
# hash_block_size: the block size used to compute block hashes.
# The actual block size usually equals hash_block_size, but in cases where
# different KV cache groups have different block sizes, the actual block size
# can be a multiple of hash_block_size.
self.hash_block_size = hash_block_size
assert all(
g.kv_cache_spec.block_size % hash_block_size == 0
for g in kv_cache_config.kv_cache_groups
), "block_size must be divisible by hash_block_size"
assert dcp_world_size == 1, "DCP not support hybrid attn now." assert dcp_world_size == 1, "DCP not support hybrid attn now."
assert pcp_world_size == 1, "PCP not support hybrid attn now." assert pcp_world_size == 1, "PCP not support hybrid attn now."
self.verify_and_split_kv_cache_groups() self.verify_and_split_kv_cache_groups()
@ -373,14 +408,12 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
self.other_spec = other_spec self.other_spec = other_spec
self.full_attention_block_size = self.full_attention_spec.block_size self.full_attention_block_size = self.full_attention_spec.block_size
self.other_block_size = self.other_spec.block_size self.other_block_size = self.other_spec.block_size
# The LCM of the block sizes of full attention and other attention.
if self.enable_caching: # The cache hit length must be a multiple of the LCM of the block sizes
# this requirement is only needed for the prefix caching logic # to make sure the cache hit length is a multiple of the block size of
divisible = self.other_block_size % self.full_attention_block_size # each attention type. Requiring this because we don't support partial
assert divisible == 0, ( # block cache hit yet.
"KVCacheCoordinator assumes the block_size of full " self.lcm_block_size = lcm(self.full_attention_block_size, self.other_block_size)
"attention layers is divisible by other layers now."
)
if max(self.full_attention_group_ids) < min(self.other_group_ids): if max(self.full_attention_group_ids) < min(self.other_group_ids):
self.full_attn_first = True self.full_attn_first = True
@ -414,25 +447,48 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
- The number of tokens of the longest cache hit. - The number of tokens of the longest cache hit.
""" """
# First, find the longest cache hit for full attention. # First, find the longest cache hit for full attention.
if self.full_attention_spec.block_size == self.hash_block_size:
# Common case.
full_attention_block_hashes: BlockHashList = block_hashes
else:
# block_size is a multiple of hash_block_size. This happens when different
# KV cache groups have different block sizes. In this case, we need to
# recalculate block_hashes at the granularity of block_size, using the
# original block_hashes (at the granularity of hash_block_size).
full_attention_block_hashes = BlockHashListWithBlockSize(
block_hashes, self.hash_block_size, self.full_attention_spec.block_size
)
hit_blocks_full_attn = self.full_attention_manager_cls.find_longest_cache_hit( hit_blocks_full_attn = self.full_attention_manager_cls.find_longest_cache_hit(
block_hashes=block_hashes, block_hashes=full_attention_block_hashes,
max_length=max_cache_hit_length, max_length=max_cache_hit_length,
kv_cache_group_ids=self.full_attention_group_ids, kv_cache_group_ids=self.full_attention_group_ids,
block_pool=self.block_pool, block_pool=self.block_pool,
kv_cache_spec=self.full_attention_spec, kv_cache_spec=self.full_attention_spec,
use_eagle=self.use_eagle, use_eagle=self.use_eagle,
alignment_tokens=self.lcm_block_size,
) )
hit_length = len(hit_blocks_full_attn[0]) * self.full_attention_block_size hit_length = len(hit_blocks_full_attn[0]) * self.full_attention_block_size
# Next, find the cache hit for the other attention WITHIN # Next, find the cache hit for the other attention WITHIN
# the cache hit of full attention. # the cache hit of full attention.
if self.other_spec.block_size == self.hash_block_size:
# Common case.
other_block_hashes: BlockHashList = block_hashes
else:
# Similar to the full attention case, here we need to recalculate
# block_hashes at the granularity of block_size, using the original
# block_hashes (at the granularity of hash_block_size).
other_block_hashes = BlockHashListWithBlockSize(
block_hashes, self.hash_block_size, self.other_spec.block_size
)
hit_blocks_other_attn = self.other_attention_cls.find_longest_cache_hit( hit_blocks_other_attn = self.other_attention_cls.find_longest_cache_hit(
block_hashes=block_hashes, block_hashes=other_block_hashes,
max_length=hit_length, max_length=hit_length,
kv_cache_group_ids=self.other_group_ids, kv_cache_group_ids=self.other_group_ids,
block_pool=self.block_pool, block_pool=self.block_pool,
kv_cache_spec=self.other_spec, kv_cache_spec=self.other_spec,
use_eagle=self.use_eagle, use_eagle=self.use_eagle,
alignment_tokens=self.lcm_block_size,
) )
hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size
@ -466,6 +522,7 @@ def get_kv_cache_coordinator(
enable_kv_cache_events: bool, enable_kv_cache_events: bool,
dcp_world_size: int, dcp_world_size: int,
pcp_world_size: int, pcp_world_size: int,
hash_block_size: int,
) -> KVCacheCoordinator: ) -> KVCacheCoordinator:
if not enable_caching: if not enable_caching:
return KVCacheCoordinatorNoPrefixCache( return KVCacheCoordinatorNoPrefixCache(
@ -473,8 +530,9 @@ def get_kv_cache_coordinator(
max_model_len, max_model_len,
use_eagle, use_eagle,
enable_kv_cache_events, enable_kv_cache_events,
dcp_world_size=dcp_world_size, dcp_world_size,
pcp_world_size=pcp_world_size, pcp_world_size,
hash_block_size,
) )
if len(kv_cache_config.kv_cache_groups) == 1: if len(kv_cache_config.kv_cache_groups) == 1:
return UnitaryKVCacheCoordinator( return UnitaryKVCacheCoordinator(
@ -483,8 +541,9 @@ def get_kv_cache_coordinator(
use_eagle, use_eagle,
enable_caching, enable_caching,
enable_kv_cache_events, enable_kv_cache_events,
dcp_world_size=dcp_world_size, dcp_world_size,
pcp_world_size=pcp_world_size, pcp_world_size,
hash_block_size,
) )
return HybridKVCacheCoordinator( return HybridKVCacheCoordinator(
kv_cache_config, kv_cache_config,
@ -492,6 +551,7 @@ def get_kv_cache_coordinator(
use_eagle, use_eagle,
enable_caching, enable_caching,
enable_kv_cache_events, enable_kv_cache_events,
dcp_world_size=dcp_world_size, dcp_world_size,
pcp_world_size=pcp_world_size, pcp_world_size,
hash_block_size,
) )

View File

@ -95,6 +95,7 @@ class KVCacheManager:
self, self,
kv_cache_config: KVCacheConfig, kv_cache_config: KVCacheConfig,
max_model_len: int, max_model_len: int,
hash_block_size: int,
enable_caching: bool = True, enable_caching: bool = True,
use_eagle: bool = False, use_eagle: bool = False,
log_stats: bool = False, log_stats: bool = False,
@ -107,28 +108,11 @@ class KVCacheManager:
self.enable_caching = enable_caching self.enable_caching = enable_caching
self.use_eagle = use_eagle self.use_eagle = use_eagle
self.log_stats = log_stats self.log_stats = log_stats
# FIXME: make prefix cache stats conditional on log_stats # FIXME: make prefix cache stats conditional on log_stats. We still need
# this comment because when the log stats is enabled there are still
# potential configs we could expose in the future.
self.prefix_cache_stats = PrefixCacheStats() if log_stats else None self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
self.block_size: int | None = None
if self.enable_caching:
assert (
len(
set(
g.kv_cache_spec.block_size
for g in kv_cache_config.kv_cache_groups
)
)
== 1
), "Only one block size is supported for now"
self.block_size = kv_cache_config.kv_cache_groups[
0
].kv_cache_spec.block_size
if dcp_world_size * pcp_world_size > 1:
assert len(kv_cache_config.kv_cache_groups) == 1
self.block_size *= dcp_world_size * pcp_world_size
self.coordinator = get_kv_cache_coordinator( self.coordinator = get_kv_cache_coordinator(
kv_cache_config=kv_cache_config, kv_cache_config=kv_cache_config,
max_model_len=self.max_model_len, max_model_len=self.max_model_len,
@ -137,6 +121,7 @@ class KVCacheManager:
enable_kv_cache_events=enable_kv_cache_events, enable_kv_cache_events=enable_kv_cache_events,
dcp_world_size=dcp_world_size, dcp_world_size=dcp_world_size,
pcp_world_size=pcp_world_size, pcp_world_size=pcp_world_size,
hash_block_size=hash_block_size,
) )
self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups) self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups)
self.block_pool = self.coordinator.block_pool self.block_pool = self.coordinator.block_pool

View File

@ -5,9 +5,9 @@
import copy import copy
import os import os
from collections import defaultdict from collections import defaultdict
from collections.abc import Callable, Iterable, Sequence from collections.abc import Callable, Iterable, Iterator, Sequence
from dataclasses import dataclass from dataclasses import dataclass, replace
from typing import Any, NewType, TypeAlias from typing import Any, NewType, TypeAlias, overload
from vllm import envs from vllm import envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
@ -825,11 +825,11 @@ def get_num_blocks(
return num_blocks return num_blocks
def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int: def get_uniform_page_size(kv_cache_specs: Iterable[KVCacheSpec]) -> int:
""" """
Get the page size of the KV cache. Get the page size of the KV cache.
""" """
page_sizes = set(layer.page_size_bytes for layer in kv_cache_spec.values()) page_sizes = {layer.page_size_bytes for layer in kv_cache_specs}
assert len(page_sizes) == 1 assert len(page_sizes) == 1
return page_sizes.pop() return page_sizes.pop()
@ -882,6 +882,46 @@ def is_kv_cache_page_size_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool
return len(page_sizes) == 1 return len(page_sizes) == 1
def unify_kv_cache_spec_page_size(
kv_cache_spec: dict[str, KVCacheSpec],
) -> dict[str, KVCacheSpec]:
"""
Unify the page size of the given KVCacheSpec. If the page size of all layers
are the same, return the original KVCacheSpec. If not same, unify the page
size by increasing the block size of layers with smaller page size. Raise
NotImplementedError if failed to unify the page size.
Args:
kv_cache_spec: The KVCacheSpec of each attention layer in the model
Returns:
The updated KVCacheSpec with the same page_size_bytes.
"""
page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
if len(page_sizes) <= 1:
# All layers have the same page size, no need to unify.
return kv_cache_spec
max_page_size = max(page_sizes)
new_kv_cache_spec = {}
for layer_name, layer_spec in kv_cache_spec.items():
if layer_spec.page_size_bytes == max_page_size:
new_kv_cache_spec[layer_name] = layer_spec
else:
layer_page_size = layer_spec.page_size_bytes
if max_page_size % layer_page_size != 0:
raise NotImplementedError(
"The page size of the layer is not divisible by the "
"maximum page size. Cannot unify by adjusting block_size."
)
ratio = max_page_size // layer_page_size
new_block_size = layer_spec.block_size * ratio
new_spec = replace(layer_spec, block_size=new_block_size)
assert new_spec.page_size_bytes == max_page_size
new_kv_cache_spec[layer_name] = new_spec
return new_kv_cache_spec
def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
# kv_cache_spec is an empty dict for attention free models # kv_cache_spec is an empty dict for attention free models
return not kv_cache_spec return not kv_cache_spec
@ -1010,7 +1050,6 @@ def _get_kv_cache_groups_uniform_page_size(
def get_kv_cache_config_from_groups( def get_kv_cache_config_from_groups(
vllm_config: VllmConfig, vllm_config: VllmConfig,
kv_cache_groups: list[KVCacheGroupSpec], kv_cache_groups: list[KVCacheGroupSpec],
kv_cache_specs: dict[str, KVCacheSpec],
available_memory: int, available_memory: int,
) -> KVCacheConfig: ) -> KVCacheConfig:
""" """
@ -1020,7 +1059,6 @@ def get_kv_cache_config_from_groups(
Args: Args:
vllm_config: The global VllmConfig vllm_config: The global VllmConfig
kv_cache_groups: The KV cache groups kv_cache_groups: The KV cache groups
kv_cache_specs: The KV cache spec of each attention layer in the model
available_memory: Memory available for KV cache in bytes available_memory: Memory available for KV cache in bytes
Returns: Returns:
The generated KVCacheConfig The generated KVCacheConfig
@ -1064,7 +1102,9 @@ def get_kv_cache_config_from_groups(
# full.1, sw.2: share another Tensor with size=available_memory//2 # full.1, sw.2: share another Tensor with size=available_memory//2
group_size = max(len(group.layer_names) for group in kv_cache_groups) group_size = max(len(group.layer_names) for group in kv_cache_groups)
page_size = get_uniform_page_size(kv_cache_specs) page_size = get_uniform_page_size(
[group.kv_cache_spec for group in kv_cache_groups]
)
assert group_size > 0, "group_size must be greater than 0" assert group_size > 0, "group_size must be greater than 0"
num_blocks = get_num_blocks( num_blocks = get_num_blocks(
vllm_config, group_size, available_memory, page_size vllm_config, group_size, available_memory, page_size
@ -1166,7 +1206,8 @@ def get_kv_cache_groups(
# This returns an empty list to allow for the KVCacheManager to handle # This returns an empty list to allow for the KVCacheManager to handle
# attention free models. # attention free models.
return [] return []
elif is_kv_cache_spec_uniform(kv_cache_spec):
if is_kv_cache_spec_uniform(kv_cache_spec):
# KV cache of all layers are the same, which is true for # KV cache of all layers are the same, which is true for
# most models. Allocate the same amount of memory for # most models. Allocate the same amount of memory for
# each layer. # each layer.
@ -1176,14 +1217,16 @@ def get_kv_cache_groups(
# full attention, or all layers are sliding window attention with the # full attention, or all layers are sliding window attention with the
# same window size). Put all layers into one group. # same window size). Put all layers into one group.
return _get_kv_cache_groups_uniform_type(uniform_spec) return _get_kv_cache_groups_uniform_type(uniform_spec)
elif is_kv_cache_page_size_uniform(kv_cache_spec):
# Model contains multiple attention types, but KV cache of all layers
# have the same physical memory per block per layer. Split the layers
# into groups with the same number of layers, and thus same total page
# size.
return _get_kv_cache_groups_uniform_page_size(kv_cache_spec)
raise NotImplementedError # As KVCacheManager can only allocate memory of one size, we need to unify
# the page size of the layers. For cases cannot be unified, this function
# will raise an error.
kv_cache_spec = unify_kv_cache_spec_page_size(kv_cache_spec)
# Model contains multiple attention types, but KV cache of all layers
# have the same physical memory per block per layer. Split the layers
# into groups with the same number of layers, and thus same total page
# size.
return _get_kv_cache_groups_uniform_page_size(kv_cache_spec)
def generate_scheduler_kv_cache_config( def generate_scheduler_kv_cache_config(
@ -1327,10 +1370,7 @@ def get_kv_cache_configs(
) == len(kv_cache_spec_one_worker), "Some layers are not assigned to any group." ) == len(kv_cache_spec_one_worker), "Some layers are not assigned to any group."
kv_cache_configs.append( kv_cache_configs.append(
get_kv_cache_config_from_groups( get_kv_cache_config_from_groups(
vllm_config, vllm_config, kv_cache_groups_one_worker, available_memory_one_worker
kv_cache_groups_one_worker,
kv_cache_spec_one_worker,
available_memory_one_worker,
) )
) )
@ -1353,3 +1393,79 @@ def get_kv_cache_configs(
_report_kv_cache_config(vllm_config, kv_cache_config) _report_kv_cache_config(vllm_config, kv_cache_config)
return kv_cache_configs return kv_cache_configs
class BlockHashListWithBlockSize:
"""
Convert block-hash granularity from `hash_block_size` to `target_block_size`.
Used when KV cache groups have different block sizes: `hash_block_size`
is the size used to compute the original `block_hashes`; `target_block_size`
is the group's actual block size.
Currently, only scaling up by an integer factor is supported (i.e.,
`target_block_size` is a multiple of `hash_block_size`). Conversion is
performed lazily on access for efficiency, by concatenating consecutive
hashes at `hash_block_size` to form each hash at `target_block_size`.
Example (`hash_block_size` = 16, `target_block_size` = 32):
concatenating two 16-size hashes yields one 32-size hash:
Block hashes with block_size 16:
| Token Range | 0-15 | 16-31 | 32-47 | 48-63 |
|-------------|------|-------|-------|-------|
| Hash | A | B | C | D |
Block hashes with block_size 32:
| Token Range | 0-31 | 32-63 |
|-------------|------|-------|
| Hash | AB | CD |
Args:
block_hashes: Block hashes to convert, computed at `hash_block_size`.
hash_block_size: Block size at which `block_hashes` were computed.
target_block_size: Desired block size; must be a multiple of `hash_block_size`.
"""
def __init__(
self,
block_hashes: list[BlockHash],
hash_block_size: int,
target_block_size: int,
):
self.block_hashes = block_hashes
assert target_block_size % hash_block_size == 0
self.scale_factor = target_block_size // hash_block_size
def __len__(self) -> int:
return len(self.block_hashes) // self.scale_factor
@overload
def __getitem__(self, idx: int) -> BlockHash: ...
@overload
def __getitem__(self, idx: slice) -> list[BlockHash]: ...
def __getitem__(self, idx):
if isinstance(idx, int):
return self._get_value_at(idx)
if isinstance(idx, slice):
start, stop, step = idx.indices(len(self))
return [self._get_value_at(i) for i in range(start, stop, step)]
raise TypeError(f"Invalid index type: {type(idx)!r}")
def __iter__(self) -> Iterator[BlockHash]:
for i in range(len(self)):
yield self._get_value_at(i)
def _get_value_at(self, idx: int) -> BlockHash:
base = idx * self.scale_factor
end = base + self.scale_factor
merged_hash: bytes = self.block_hashes[base]
for i in range(base + 1, end):
merged_hash += self.block_hashes[i]
return BlockHash(merged_hash)
BlockHashList = list[BlockHash] | BlockHashListWithBlockSize

View File

@ -186,6 +186,7 @@ class Scheduler(SchedulerInterface):
enable_kv_cache_events=self.enable_kv_cache_events, enable_kv_cache_events=self.enable_kv_cache_events,
dcp_world_size=self.dcp_world_size, dcp_world_size=self.dcp_world_size,
pcp_world_size=self.pcp_world_size, pcp_world_size=self.pcp_world_size,
hash_block_size=self.block_size,
) )
sink_len = getattr(vllm_config.model_config.hf_config, "param_sink_number", 0) sink_len = getattr(vllm_config.model_config.hf_config, "param_sink_number", 0)
if sink_len > 0: if sink_len > 0:
@ -1093,8 +1094,6 @@ class Scheduler(SchedulerInterface):
and request.sampling_params.logprobs is not None and request.sampling_params.logprobs is not None
and logprobs and logprobs
): ):
# NOTE: once we support N tokens per step (spec decode),
# the outer lists can be of length > 1.
new_logprobs = logprobs.slice(req_index, req_index + 1) new_logprobs = logprobs.slice(req_index, req_index + 1)
if new_token_ids and self.structured_output_manager.should_advance(request): if new_token_ids and self.structured_output_manager.should_advance(request):

View File

@ -7,7 +7,7 @@ from collections.abc import Sequence
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.core.kv_cache_utils import BlockHashList, KVCacheBlock
from vllm.v1.kv_cache_interface import ( from vllm.v1.kv_cache_interface import (
ChunkedLocalAttentionSpec, ChunkedLocalAttentionSpec,
CrossAttentionSpec, CrossAttentionSpec,
@ -208,12 +208,13 @@ class SingleTypeKVCacheManager(ABC):
@abstractmethod @abstractmethod
def find_longest_cache_hit( def find_longest_cache_hit(
cls, cls,
block_hashes: list[BlockHash], block_hashes: BlockHashList,
max_length: int, max_length: int,
kv_cache_group_ids: list[int], kv_cache_group_ids: list[int],
block_pool: BlockPool, block_pool: BlockPool,
kv_cache_spec: KVCacheSpec, kv_cache_spec: KVCacheSpec,
use_eagle: bool, use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1, dcp_world_size: int = 1,
pcp_world_size: int = 1, pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]: ) -> tuple[list[KVCacheBlock], ...]:
@ -233,6 +234,11 @@ class SingleTypeKVCacheManager(ABC):
block_pool: The block pool. block_pool: The block pool.
kv_cache_spec: The kv cache spec. kv_cache_spec: The kv cache spec.
use_eagle: Whether to use eagle. use_eagle: Whether to use eagle.
alignment_tokens: The returned cache hit length (in tokens) should
be a multiple of this value (in tokens). By default, it should
be set to the block_size.
dcp_world_size: The world size of decode context parallelism.
pcp_world_size: The world size of prefill context parallelism.
Returns: Returns:
A list of cached blocks with skipped blocks replaced by null block A list of cached blocks with skipped blocks replaced by null block
@ -300,18 +306,19 @@ class FullAttentionManager(SingleTypeKVCacheManager):
@classmethod @classmethod
def find_longest_cache_hit( def find_longest_cache_hit(
cls, cls,
block_hashes: list[BlockHash], block_hashes: BlockHashList,
max_length: int, max_length: int,
kv_cache_group_ids: list[int], kv_cache_group_ids: list[int],
block_pool: BlockPool, block_pool: BlockPool,
kv_cache_spec: KVCacheSpec, kv_cache_spec: KVCacheSpec,
use_eagle: bool, use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1, dcp_world_size: int = 1,
pcp_world_size: int = 1, pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]: ) -> tuple[list[KVCacheBlock], ...]:
assert isinstance( assert isinstance(
kv_cache_spec, kv_cache_spec,
(FullAttentionSpec, FullDiffkvAttentionSpec, ChunkedLocalAttentionSpec), FullAttentionSpec | ChunkedLocalAttentionSpec | FullDiffkvAttentionSpec
), ( ), (
"FullAttentionManager can only be used for full attention " "FullAttentionManager can only be used for full attention "
"and chunked local attention groups" "and chunked local attention groups"
@ -335,6 +342,13 @@ class FullAttentionManager(SingleTypeKVCacheManager):
else: else:
break break
if use_eagle and computed_blocks[0]: if use_eagle and computed_blocks[0]:
# Need to drop the last matched block if eagle is enabled.
for computed in computed_blocks:
computed.pop()
while (
block_size != alignment_tokens # Faster for common case.
and len(computed_blocks[0]) * block_size % alignment_tokens != 0
):
for computed in computed_blocks: for computed in computed_blocks:
computed.pop() computed.pop()
return computed_blocks return computed_blocks
@ -361,12 +375,13 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
@classmethod @classmethod
def find_longest_cache_hit( def find_longest_cache_hit(
cls, cls,
block_hashes: list[BlockHash], block_hashes: BlockHashList,
max_length: int, max_length: int,
kv_cache_group_ids: list[int], kv_cache_group_ids: list[int],
block_pool: BlockPool, block_pool: BlockPool,
kv_cache_spec: KVCacheSpec, kv_cache_spec: KVCacheSpec,
use_eagle: bool, use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1, dcp_world_size: int = 1,
pcp_world_size: int = 1, pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]: ) -> tuple[list[KVCacheBlock], ...]:
@ -398,6 +413,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
[block_pool.null_block] * max_num_blocks [block_pool.null_block] * max_num_blocks
for _ in range(len(kv_cache_group_ids)) for _ in range(len(kv_cache_group_ids))
) )
block_size = kv_cache_spec.block_size
num_contiguous_blocks = 0 num_contiguous_blocks = 0
match_found = False match_found = False
# Search from right to left and early stop when a match is found. # Search from right to left and early stop when a match is found.
@ -405,6 +421,15 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
if cached_block := block_pool.get_cached_block( if cached_block := block_pool.get_cached_block(
block_hashes[i], kv_cache_group_ids block_hashes[i], kv_cache_group_ids
): ):
# Skip prefix matching check if the block is not aligned with
# `alignment_tokens`.
if (
num_contiguous_blocks == 0
and block_size != alignment_tokens # Faster for common case.
and (i + 1) * block_size % alignment_tokens != 0
):
continue
# Add the cached block to the computed blocks.
for computed, cached in zip(computed_blocks, cached_block): for computed, cached in zip(computed_blocks, cached_block):
computed[i] = cached computed[i] = cached
num_contiguous_blocks += 1 num_contiguous_blocks += 1
@ -423,7 +448,16 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
# `num_contiguous_blocks < sliding_window_contiguous_blocks`. # `num_contiguous_blocks < sliding_window_contiguous_blocks`.
for computed in computed_blocks: for computed in computed_blocks:
del computed[num_contiguous_blocks:] del computed[num_contiguous_blocks:]
while (
block_size != alignment_tokens # Faster for common case.
and len(computed_blocks[0]) * block_size % alignment_tokens != 0
):
for computed in computed_blocks:
computed.pop()
if use_eagle and computed_blocks[0]: if use_eagle and computed_blocks[0]:
assert kv_cache_spec.block_size == alignment_tokens, (
"aligned_length is not compatible with eagle now"
)
for computed in computed_blocks: for computed in computed_blocks:
computed.pop() computed.pop()
return computed_blocks return computed_blocks
@ -477,12 +511,13 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
@classmethod @classmethod
def find_longest_cache_hit( def find_longest_cache_hit(
cls, cls,
block_hashes: list[BlockHash], block_hashes: BlockHashList,
max_length: int, max_length: int,
kv_cache_group_ids: list[int], kv_cache_group_ids: list[int],
block_pool: BlockPool, block_pool: BlockPool,
kv_cache_spec: KVCacheSpec, kv_cache_spec: KVCacheSpec,
use_eagle: bool, use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1, dcp_world_size: int = 1,
pcp_world_size: int = 1, pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]: ) -> tuple[list[KVCacheBlock], ...]:
@ -513,6 +548,10 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
block_pool: The block pool. block_pool: The block pool.
kv_cache_spec: The kv cache spec. kv_cache_spec: The kv cache spec.
use_eagle: Whether to use eagle. use_eagle: Whether to use eagle.
dcp_world_size: The world size of decode context parallelism.
pcp_world_size: The world size of prefill context parallelism.
alignment_tokens: The returned cache hit length (in tokens) should
be a multiple of this value (in tokens).
Returns: Returns:
A list of cached blocks A list of cached blocks
@ -526,6 +565,10 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
) )
assert dcp_world_size == 1, "DCP not support chunked local attn now." assert dcp_world_size == 1, "DCP not support chunked local attn now."
assert pcp_world_size == 1, "PCP not support chunked local attn now." assert pcp_world_size == 1, "PCP not support chunked local attn now."
assert kv_cache_spec.block_size == alignment_tokens, (
"KV cache groups with different block sizes are not compatible with "
"chunked local attention now"
)
max_num_blocks = max_length // kv_cache_spec.block_size max_num_blocks = max_length // kv_cache_spec.block_size
if max_length > 0: if max_length > 0:
local_attention_start_idx = ( local_attention_start_idx = (
@ -614,12 +657,13 @@ class MambaManager(SingleTypeKVCacheManager):
@classmethod @classmethod
def find_longest_cache_hit( def find_longest_cache_hit(
cls, cls,
block_hashes: list[BlockHash], block_hashes: BlockHashList,
max_length: int, max_length: int,
kv_cache_group_ids: list[int], kv_cache_group_ids: list[int],
block_pool: BlockPool, block_pool: BlockPool,
kv_cache_spec: KVCacheSpec, kv_cache_spec: KVCacheSpec,
use_eagle: bool, use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1, dcp_world_size: int = 1,
pcp_world_size: int = 1, pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]: ) -> tuple[list[KVCacheBlock], ...]:
@ -632,12 +676,21 @@ class MambaManager(SingleTypeKVCacheManager):
[] for _ in range(len(kv_cache_group_ids)) [] for _ in range(len(kv_cache_group_ids))
) )
max_num_blocks = max_length // kv_cache_spec.block_size block_size = kv_cache_spec.block_size
max_num_blocks = max_length // block_size
# Search from right to left and early stop when a match is found. # Search from right to left and early stop when a match is found.
for i in range(max_num_blocks - 1, -1, -1): for i in range(max_num_blocks - 1, -1, -1):
if cached_block := block_pool.get_cached_block( if cached_block := block_pool.get_cached_block(
block_hashes[i], kv_cache_group_ids block_hashes[i], kv_cache_group_ids
): ):
# When enable Mamba prefix caching, `block_size` will be aligned
# across full attention layers and Mamba layers to ensure the
# prefix hit length aligned at block
if (
block_size != alignment_tokens # Faster for common case.
and (i + 1) * block_size % alignment_tokens != 0
):
continue
for computed, cached in zip(computed_blocks, cached_block): for computed, cached in zip(computed_blocks, cached_block):
# the hit length logic later assumes: # the hit length logic later assumes:
# hit_length = len(hit_blocks_other_attn[0]) # hit_length = len(hit_blocks_other_attn[0])
@ -710,12 +763,13 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
@classmethod @classmethod
def find_longest_cache_hit( def find_longest_cache_hit(
cls, cls,
block_hashes: list[BlockHash], block_hashes: BlockHashList,
max_length: int, max_length: int,
kv_cache_group_ids: list[int], kv_cache_group_ids: list[int],
block_pool: BlockPool, block_pool: BlockPool,
kv_cache_spec: KVCacheSpec, kv_cache_spec: KVCacheSpec,
use_eagle: bool, use_eagle: bool,
alignment_tokens: int,
dcp_world_size: int = 1, dcp_world_size: int = 1,
pcp_world_size: int = 1, pcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]: ) -> tuple[list[KVCacheBlock], ...]:

View File

@ -31,7 +31,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils.async_utils import cancel_task_threadsafe from vllm.utils.async_utils import cancel_task_threadsafe
from vllm.utils.collection_utils import as_list from vllm.utils.collection_utils import as_list
from vllm.utils.func_utils import deprecate_kwargs
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.core_client import EngineCoreClient
@ -195,12 +194,6 @@ class AsyncLLM(EngineClient):
self.profiler = None self.profiler = None
@classmethod @classmethod
@deprecate_kwargs(
"disable_log_requests",
additional_message=(
"This argument will have no effect. Use `enable_log_requests` instead."
),
)
def from_vllm_config( def from_vllm_config(
cls, cls,
vllm_config: VllmConfig, vllm_config: VllmConfig,
@ -213,7 +206,6 @@ class AsyncLLM(EngineClient):
client_addresses: dict[str, str] | None = None, client_addresses: dict[str, str] | None = None,
client_count: int = 1, client_count: int = 1,
client_index: int = 0, client_index: int = 0,
disable_log_requests: bool = True, # Deprecated, will be removed
) -> "AsyncLLM": ) -> "AsyncLLM":
# Create the LLMEngine. # Create the LLMEngine.
return cls( return cls(

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from dataclasses import replace from dataclasses import replace
import torch import torch
@ -204,7 +205,9 @@ class RejectionSampler(nn.Module):
def parse_output( def parse_output(
output_token_ids: torch.Tensor, output_token_ids: torch.Tensor,
vocab_size: int, vocab_size: int,
) -> list[list[int]]: discard_req_indices: Sequence[int] = (),
return_cu_num_tokens: bool = False,
) -> tuple[list[list[int]], list[int] | None]:
"""Parse the output of the rejection sampler. """Parse the output of the rejection sampler.
Args: Args:
output_token_ids: The sampled token IDs in shape output_token_ids: The sampled token IDs in shape
@ -212,6 +215,8 @@ class RejectionSampler(nn.Module):
replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler
and will be filtered out in this function. and will be filtered out in this function.
vocab_size: The size of the vocabulary. vocab_size: The size of the vocabulary.
discard_req_indices: Optional row indices to discard tokens in.
return_cu_num_tokens: Whether to also return cumulative token counts.
Returns: Returns:
A list of lists of token IDs. A list of lists of token IDs.
""" """
@ -220,10 +225,15 @@ class RejectionSampler(nn.Module):
valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & ( valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & (
output_token_ids_np < vocab_size output_token_ids_np < vocab_size
) )
cu_num_tokens = None
if return_cu_num_tokens:
cu_num_tokens = [0] + valid_mask.sum(axis=1).cumsum().tolist()
if len(discard_req_indices) > 0:
valid_mask[discard_req_indices] = False
outputs = [ outputs = [
row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np) row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np)
] ]
return outputs return outputs, cu_num_tokens
def apply_logits_processors( def apply_logits_processors(
self, self,

View File

@ -1055,11 +1055,11 @@ class EagleProposer:
elif ( elif (
isinstance(target_embed_tokens.weight, torch.Tensor) isinstance(target_embed_tokens.weight, torch.Tensor)
and isinstance(self.model.model.embed_tokens.weight, torch.Tensor) and isinstance(self.model.model.embed_tokens.weight, torch.Tensor)
and torch.allclose( # TODO: Offload to CPU for comparison to avoid extra GPU memory
# usage in CI testing environments with limited GPU memory
and torch.equal(
target_embed_tokens.weight.cpu(), target_embed_tokens.weight.cpu(),
self.model.model.embed_tokens.weight.cpu(), self.model.model.embed_tokens.weight.cpu(),
rtol=1e-5,
atol=1e-7,
) )
): ):
share_embeddings = True share_embeddings = True
@ -1105,8 +1105,11 @@ class EagleProposer:
hasattr(target_language_model, "lm_head") hasattr(target_language_model, "lm_head")
and isinstance(target_language_model.lm_head.weight, torch.Tensor) and isinstance(target_language_model.lm_head.weight, torch.Tensor)
and isinstance(self.model.lm_head.weight, torch.Tensor) and isinstance(self.model.lm_head.weight, torch.Tensor)
# TODO: Offload to CPU for comparison to avoid extra GPU memory
# usage in CI testing environments with limited GPU memory
and torch.equal( and torch.equal(
target_language_model.lm_head.weight, self.model.lm_head.weight target_language_model.lm_head.weight.cpu(),
self.model.lm_head.weight.cpu(),
) )
): ):
share_lm_head = True share_lm_head = True

View File

@ -186,7 +186,7 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
self, self,
model_runner_output: ModelRunnerOutput, model_runner_output: ModelRunnerOutput,
sampled_token_ids: torch.Tensor, sampled_token_ids: torch.Tensor,
logprobs_tensors: torch.Tensor | None, logprobs_tensors: LogprobsTensors | None,
invalid_req_indices: list[int], invalid_req_indices: list[int],
async_output_copy_stream: torch.cuda.Stream, async_output_copy_stream: torch.cuda.Stream,
vocab_size: int, vocab_size: int,
@ -222,28 +222,29 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
This function blocks until the copy is finished. This function blocks until the copy is finished.
""" """
max_gen_len = self.sampled_token_ids_cpu.shape[-1]
self.async_copy_ready_event.synchronize() self.async_copy_ready_event.synchronize()
# Release the device tensors once the copy has completed. # Release the device tensors once the copy has completed.
del self._logprobs_tensors del self._logprobs_tensors
del self._sampled_token_ids del self._sampled_token_ids
max_gen_len = self.sampled_token_ids_cpu.shape[-1]
if max_gen_len == 1: if max_gen_len == 1:
valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist() valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist()
for i in self._invalid_req_indices:
valid_sampled_token_ids[i].clear()
cu_num_tokens = None
else: else:
valid_sampled_token_ids = RejectionSampler.parse_output( valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
self.sampled_token_ids_cpu, self.sampled_token_ids_cpu,
self.vocab_size, self.vocab_size,
self._invalid_req_indices,
return_cu_num_tokens=self._logprobs_tensors_cpu is not None,
) )
for i in self._invalid_req_indices:
valid_sampled_token_ids[i].clear()
output = self._model_runner_output output = self._model_runner_output
output.sampled_token_ids = valid_sampled_token_ids output.sampled_token_ids = valid_sampled_token_ids
if self._logprobs_tensors_cpu: if self._logprobs_tensors_cpu:
# NOTE(nick): this will need to be updated to use cu_num_accepted_tokens output.logprobs = self._logprobs_tensors_cpu.tolists(cu_num_tokens)
# for async sched + spec decode + logprobs compatibility.
output.logprobs = self._logprobs_tensors_cpu.tolists()
return output return output
@ -2629,28 +2630,24 @@ class GPUModelRunner(
sampled_token_ids = sampler_output.sampled_token_ids sampled_token_ids = sampler_output.sampled_token_ids
logprobs_tensors = sampler_output.logprobs_tensors logprobs_tensors = sampler_output.logprobs_tensors
invalid_req_indices = [] invalid_req_indices = []
cu_num_new_tokens: list[int] | None = None cu_num_tokens: list[int] | None = None
if not self.use_async_scheduling: if not self.use_async_scheduling:
# Get the valid generated tokens. # Get the valid generated tokens.
max_gen_len = sampled_token_ids.shape[-1] max_gen_len = sampled_token_ids.shape[-1]
if max_gen_len == 1: if max_gen_len == 1:
# No spec decode tokens. # No spec decode tokens.
valid_sampled_token_ids = self._to_list(sampled_token_ids) valid_sampled_token_ids = self._to_list(sampled_token_ids)
# Mask out the sampled tokens that should not be sampled.
for i in discard_sampled_tokens_req_indices:
valid_sampled_token_ids[int(i)].clear()
else: else:
# Includes spec decode tokens. # Includes spec decode tokens.
valid_sampled_token_ids = self.rejection_sampler.parse_output( valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output(
sampled_token_ids, sampled_token_ids,
self.input_batch.vocab_size, self.input_batch.vocab_size,
discard_sampled_tokens_req_indices,
return_cu_num_tokens=logprobs_tensors is not None,
) )
if logprobs_tensors:
# Needed for extracting logprobs when spec decoding.
# This must be done prior to discarding sampled tokens.
cu_num_new_tokens = [0]
for toks in valid_sampled_token_ids:
cu_num_new_tokens.append(cu_num_new_tokens[-1] + len(toks))
# Mask out the sampled tokens that should not be sampled.
for i in discard_sampled_tokens_req_indices:
valid_sampled_token_ids[int(i)].clear()
else: else:
valid_sampled_token_ids = [] valid_sampled_token_ids = []
invalid_req_indices = discard_sampled_tokens_req_indices.tolist() invalid_req_indices = discard_sampled_tokens_req_indices.tolist()
@ -2704,7 +2701,7 @@ class GPUModelRunner(
req_state.output_token_ids.extend(sampled_ids) req_state.output_token_ids.extend(sampled_ids)
logprobs_lists = ( logprobs_lists = (
logprobs_tensors.tolists(cu_num_new_tokens) logprobs_tensors.tolists(cu_num_tokens)
if not self.use_async_scheduling and logprobs_tensors is not None if not self.use_async_scheduling and logprobs_tensors is not None
else None else None
) )
@ -2824,7 +2821,7 @@ class GPUModelRunner(
# returns True. before returning early here we call # returns True. before returning early here we call
# dummy run to ensure coordinate_batch_across_dp # dummy run to ensure coordinate_batch_across_dp
# is called into to avoid out of sync issues. # is called into to avoid out of sync issues.
self._dummy_run(1) self._dummy_run(self._get_num_input_tokens(1))
if not has_kv_transfer_group(): if not has_kv_transfer_group():
# Return empty ModelRunnerOutput if no work to do. # Return empty ModelRunnerOutput if no work to do.
return EMPTY_MODEL_RUNNER_OUTPUT return EMPTY_MODEL_RUNNER_OUTPUT
@ -3495,6 +3492,10 @@ class GPUModelRunner(
scope="local", scope="local",
) )
prepare_communication_buffer_for_model(self.model) prepare_communication_buffer_for_model(self.model)
if (drafter := getattr(self, "drafter", None)) and (
drafter_model := getattr(drafter, "model", None)
):
prepare_communication_buffer_for_model(drafter_model)
mm_config = self.model_config.multimodal_config mm_config = self.model_config.multimodal_config
self.is_multimodal_pruning_enabled = ( self.is_multimodal_pruning_enabled = (
supports_multimodal_pruning(self.get_model()) supports_multimodal_pruning(self.get_model())
@ -4277,14 +4278,18 @@ class GPUModelRunner(
# NOTE: This happens when encoder cache needs to store # NOTE: This happens when encoder cache needs to store
# the embeddings that encoder outputs are scattered onto. # the embeddings that encoder outputs are scattered onto.
# In this case we create dummy embeddings of size # In this case we create dummy embeddings of size
# (encode_budget, hidden_size) and scatter encoder # (max_tokens_for_modality, hidden_size) and scatter
# output into it. # encoder output into it.
encoder_output_shape = dummy_encoder_outputs[0].shape encoder_output_shape = dummy_encoder_outputs[0].shape
if encoder_output_shape[0] < encoder_budget: max_mm_tokens_per_item = mm_budget.max_tokens_by_modality[
dummy_modality
]
if encoder_output_shape[0] < max_mm_tokens_per_item:
encoder_hidden_size = encoder_output_shape[-1]
expanded_outputs = [] expanded_outputs = []
for output in dummy_encoder_outputs: for output in dummy_encoder_outputs:
expanded = output.new_zeros( expanded = output.new_zeros(
(encoder_budget, encoder_output_shape[-1]) (max_mm_tokens_per_item, encoder_hidden_size)
) )
num_tokens = output.shape[0] num_tokens = output.shape[0]
expanded[:num_tokens].copy_(output) expanded[:num_tokens].copy_(output)